Multiplex assessment of protein variant abundance by massively parallel sequencing VAMP-seq - multiplex assay that uses fluorescent reporters to measure the steady-state abundance of protein variants in cultured human cells (each cellexpresses a single variant directly fused to EGFP…the stability of the variant dictates the abundance of the EGFP fusion and, accordingly, the green fluorescence signal of the cell) - used to assess PTEN and TPMT variants

# graph VAMP-seq scores relative to variant position in protein
#pten
pten1_hbond <- pten1_proc[!is.na(pten1_proc$hbond_sum),]
pten1_hbond$secondary_struct <- ifelse(is.na(pten1_hbond$helix), "unknown",
                        ifelse(pten1_hbond$helix==1, "helix",
                        ifelse(pten1_hbond$sheet==1, "sheet",
                        ifelse(pten1_hbond$helix==0, "neither",
                        "unknown"))))
pten_plot_hbond <- ggplot(pten1_hbond, aes(x=hbond_sum, y=score, colour=secondary_struct))+ geom_point(alpha=0.4) + ylab("VAMP-seq score")+xlab("DSSP Sum of hydrogen bonds")+ggtitle("PTEN scores in relation to hydrogen bonding") + scale_color_manual(values=c("#FF4848", "#696969", "#5757FF")) + labs(colour="Secondary Structure") + theme_bw()
plot(pten_plot_hbond)

pten_plot_hbond1 <- ggplot(pten1_hbond, aes(x=hbond_sum, y=score))+ geom_point(alpha = 0.2) + ylab("VAMP-seq score")+xlab("DSSP Sum of hydrogen bonds")+ggtitle("PTEN scores in relation to hydrogen bonding") + theme_bw()
# was in aes, ggplot function call ---> colour=secondary_struct
#scale_color_manual(values=c("#FF4848", "#00C853", "#5757FF", "#A9A9A9")) + labs(colour="Secondary Structure")+
plot(pten_plot_hbond1)

#less hydrogen bonds ~ higher abundance
#method1 (basic, for visualizing in rstudio)
grid.newpage()
grid.draw(rbind(ggplotGrob(tpmt_dssp_schematic), ggplotGrob(tpmt_pos_mean), ggplotGrob(tpmt_heat), size = "last"))
Removed 1 rows containing missing values (geom_segment).Removed 1 rows containing missing values (geom_point).

grid.newpage()
grid.draw(rbind(ggplotGrob(pten_dssp_schematic), ggplotGrob(pten_pos_mean), ggplotGrob(pten_heat), size = "last"))
Removed 12 rows containing missing values (geom_errorbar).

#plotting mean score vs variant changed to 
tpmt_end_sum <- summarySE(tpmt1_proc_wt, measurevar="score", groupvars="end")
tpmt_end_mean <- ggplot(tpmt_end_sum, aes(x=end, y=score)) +
  geom_bar(position=position_dodge(), stat="identity", colour="#999999") +
  geom_errorbar(aes(ymin=score-sd, ymax=score+sd), width=0.001, position=position_dodge()) +
  ylab("mean abundance") + xlab("variant amino acid") + theme_bw()
plot(tpmt_end_mean)

pten_a_spread1 <- ggplot(pten1_proc_wt, aes(y=score, x=factor(position), colour = end)) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "A"), aes(group=factor(position)), scale = "width") + xlab("Position in PTEN") + ggtitle("Alanine variant abundance scores")+ geom_point(data=subset(pten1_proc_wt, start== "A"), aes(x=factor(position), y=score), alpha = 0.85, position=jitter2) + scale_color_manual(values=twenty_color) + theme_bw()
pten_a_aa <- ggplot(pten1_proc_wt, aes(y=score, x=end, colour = end)) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "A"), aes(group=end), scale = "width") + xlab("Amino acid") + ggtitle("Alanine variant abundance scores")+ geom_point(data=subset(pten1_proc_wt, start== "A"), aes(x=end, y=score), alpha = 0.75, position=jitter1) + scale_color_manual(values=twenty_color) + theme_bw()
pten_r_spread1 <- ggplot(pten1_proc_wt, aes(y=score, x=factor(position), colour = end)) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "R"), aes(group=factor(position)), scale = "width") + xlab("Position in PTEN") + ggtitle("Arganine variant abundance scores")+ geom_point(data=subset(pten1_proc_wt, start== "R"), aes(x=factor(position), y=score), alpha = 0.85, position=jitter2) + scale_color_manual(values=twenty_color) + theme_bw()
pten_r_aa <- ggplot(pten1_proc_wt, aes(y=score, x=end, colour = end)) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "R"), aes(group=end), scale = "width") + xlab("Amino acid") + ggtitle("Arganine variant abundance scores")+ geom_point(data=subset(pten1_proc_wt, start== "R"), aes(x=end, y=score), alpha = 0.75, position=jitter1) + scale_color_manual(values=twenty_color) + theme_bw()
pten_n_spread1 <- ggplot(pten1_proc_wt, aes(y=score, x=factor(position), colour = end)) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "N"), aes(group=factor(position)), scale = "width") + xlab("Position in PTEN") + ggtitle("Asparagine variant abundance scores")+ geom_point(data=subset(pten1_proc_wt, start== "N"), aes(x=factor(position), y=score), alpha = 0.85, position=jitter2) + scale_color_manual(values=twenty_color) + theme_bw()
pten_n_aa <- ggplot(pten1_proc_wt, aes(y=score, x=end, colour = end)) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "N"), aes(group=end), scale = "width") + xlab("Amino acid") + ggtitle("Asparagine variant abundance scores")+ geom_point(data=subset(pten1_proc_wt, start== "N"), aes(x=end, y=score), alpha = 0.75, position=jitter1) + scale_color_manual(values=twenty_color) + theme_bw()
pten_d_spread1 <- ggplot(pten1_proc_wt, aes(y=score, x=factor(position), colour = end)) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "D"), aes(group=factor(position)), scale = "width") + xlab("Position in PTEN") + ggtitle("Aspartic Acid variant abundance scores")+ geom_point(data=subset(pten1_proc_wt, start== "D"), aes(x=factor(position), y=score), alpha = 0.85, position=jitter2) + scale_color_manual(values=twenty_color) + theme_bw()
pten_d_aa <- ggplot(pten1_proc_wt, aes(y=score, x=end, colour = end)) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "D"), aes(group=end), scale = "width") + xlab("Amino acid") + ggtitle("Aspartic Acid variant abundance scores")+ geom_point(data=subset(pten1_proc_wt, start== "D"), aes(x=end, y=score), alpha = 0.75, position=jitter1) + scale_color_manual(values=twenty_color) + theme_bw()
pten_n_hydrodiff <- ggplot(pten1_proc_wt, aes(y=score, x=factor(position), colour = (hydro2-hydro1))) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "N"), aes(group=factor(position)), scale = "width") + xlab("Position in PTEN") + ggtitle("Asparagine variant abundance scores w/ hydrophobicity change")+ geom_point(data=subset(pten1_proc_wt, start== "N"), aes(x=factor(position), y=score), alpha = 0.85, position=jitter2) + scale_color_distiller(palette = "Spectral") + theme_bw()
pten_d_hydrodiff <- ggplot(pten1_proc_wt, aes(y=score, x=factor(position), colour = (hydro2-hydro1))) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "D"), aes(group=factor(position)), scale = "width") + xlab("Position in PTEN") + ggtitle("Aspartic Acid variant abundance scores w/ hydrophobicity change")+ geom_point(data=subset(pten1_proc_wt, start== "D"), aes(x=factor(position), y=score), alpha = 0.85, position=jitter2) + scale_color_distiller(palette = "Spectral") + theme_bw()
plot(pten_a_spread1)

plot(pten_a_aa)

plot(pten_r_spread1)

plot(pten_r_aa)

plot(pten_n_spread1)

plot(pten_n_hydrodiff)

plot(pten_n_aa)

plot(pten_d_spread1)

plot(pten_d_hydrodiff)

plot(pten_d_aa)

ggplot(pten1_proc_wt, aes(y=score, x=factor(position), colour = end)) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "P"), aes(group=factor(position)), scale = "width") + xlab("Position in PTEN") + ggtitle("Proline variant abundance scores")+ geom_point(data=subset(pten1_proc_wt, start== "P"), aes(x=factor(position), y=score), alpha = 0.85, position=jitter2) + scale_color_manual(values=twenty_color) + theme_bw()

ggplot(pten1_proc_wt, aes(y=score, x=end, colour = end)) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "P"), aes(group=end), scale = "width") + xlab("Amino acid") + ggtitle("Proline variant abundance scores")+ geom_point(data=subset(pten1_proc_wt, start== "P"), aes(x=end, y=score), alpha = 0.75, position=jitter1) + scale_color_manual(values=twenty_color) + theme_bw()

ggplot(pten1_proc_wt, aes(y=score, x=factor(position), colour = (hydro2-hydro1))) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "P"), aes(group=factor(position)), scale = "width") + xlab("Position in PTEN") + ggtitle("Proline variant abundance scores w/ hydrophobicity change")+ geom_point(data=subset(pten1_proc_wt, start== "P"), aes(x=factor(position), y=score), alpha = 0.85, position=jitter2) + scale_color_distiller(palette = "Spectral") + theme_bw()

ggplot(pten1_proc_wt, aes(y=score, x=factor(position), colour = end)) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "T"), aes(group=factor(position)), scale = "width") + xlab("Position in PTEN") + ggtitle("T variant abundance scores")+ geom_point(data=subset(pten1_proc_wt, start== "T"), aes(x=factor(position), y=score), alpha = 0.85, position=jitter2) + scale_color_manual(values=twenty_color) + theme_bw()

ggplot(pten1_proc_wt, aes(y=score, x=end, colour = end)) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "T"), aes(group=end), scale = "width") + xlab("Amino acid") + ggtitle("T variant abundance scores")+ geom_point(data=subset(pten1_proc_wt, start== "T"), aes(x=end, y=score), alpha = 0.75, position=jitter1) + scale_color_manual(values=twenty_color) + theme_bw()

ggplot(pten1_proc_wt, aes(y=score, x=factor(position), colour = (hydro2-hydro1))) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "T"), aes(group=factor(position)), scale = "width") + xlab("Position in PTEN") + ggtitle("T variant abundance scores w/ hydrophobicity change")+ geom_point(data=subset(pten1_proc_wt, start== "T"), aes(x=factor(position), y=score), alpha = 0.85, position=jitter2) + scale_color_distiller(palette = "Spectral") + theme_bw()

#in order to find position with low stddev (excluding nonsense)
pten_variance <- summarySE((subset(pten1_data, end != "X")), measurevar="score", groupvars="position", na.rm=TRUE)
NaNs produced
#adjust N (minimum # variants at a position) and stddev
#5, 0.11
#10, 0.15
pten_variance_filtered <- subset(subset(pten_variance, N > 8 ), sd < 0.11)
pten_variance_filtered$position
 [1]  16  18  20  21  75 204 252 274 276 277 294 329 387 388 397
#nonsense variant scores are graphed as dots, but are not included in the overlaying violin plots
ggplot(no_nonsense, aes(y=score, x=factor(position), colour = end)) + 
  geom_violin(draw_quantiles=c(0.5), data=subset(no_nonsense, no_nonsense$position %in% pten_variance_filtered$position), aes(group=factor(position)), scale = "width") + 
  xlab("Position in PTEN") + ggtitle("Intolerant and tolerant amino acid positions") +
  geom_point(data=subset(pten1_proc_wt, pten1_proc_wt$position %in% pten_variance_filtered$position), aes(x=factor(position), y=score), alpha = 0.85, position=jitter1) + scale_color_manual(values=twenty_color) + theme_bw()

ggplot(no_nonsense, aes(y=score, x=position, colour = end)) + 
  geom_violin(draw_quantiles=c(0.5), data=subset(no_nonsense, no_nonsense$position %in% pten_variance_filtered$position), aes(group=position%%450), scale = "width") + 
  xlab("Position in PTEN") + ggtitle("Intolerant and tolerant amino acid positions") +
  geom_point(data=subset(pten1_proc_wt, pten1_proc_wt$position %in% pten_variance_filtered$position), aes(x=position, y=score), alpha = 0.85, position=jitter1) + scale_color_manual(values=twenty_color) + theme_bw()

#diff parameter 
pten_variance_filtered1 <- subset(subset(pten_variance, N > 10), sd > 0.25)
no_nonsense1 <- subset(pten1_proc_wt, end != "X")
#nonsense variant scores are graphed as dots, but are not included in the overlaying violin plots
ggplot(no_nonsense1, aes(y=score, x=factor(position), colour = end)) + 
  geom_violin(draw_quantiles=c(0.5), data=subset(no_nonsense1, no_nonsense1$position %in% pten_variance_filtered1$position), aes(group=factor(position)), scale = "width") + 
  xlab("Position in PTEN") + ggtitle("Intolerant and tolerant amino acid positions") +
  geom_point(data=subset(pten1_proc_wt, pten1_proc_wt$position %in% pten_variance_filtered1$position), aes(x=factor(position), y=score), alpha = 0.85, position=jitter1) + scale_color_manual(values=twenty_color) + theme_bw()

ggplot(no_nonsense1, aes(y=score, x=position, colour = end)) + 
  geom_violin(draw_quantiles=c(0.5), data=subset(no_nonsense1, no_nonsense1$position %in% pten_variance_filtered1$position), aes(group=position%%450), scale = "width") + 
  xlab("Position in PTEN") + ggtitle("Intolerant and tolerant amino acid positions") +
  geom_point(data=subset(pten1_proc_wt, pten1_proc_wt$position %in% pten_variance_filtered1$position), aes(x=position, y=score), alpha = 0.85, position=jitter1) + scale_color_manual(values=twenty_color) + theme_bw()

chosen <- c(61, 68, 105, 108, 123, 127, 130, 132, 135, 155, 165, 173, 174, 246, 323, 335)
ggplot(pten1_proc_wt, aes(y=score, x=factor(position), colour = end)) + 
  geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, pten1_proc_wt$position %in% chosen), aes(group=factor(position)), scale = "width") + 
  xlab("Position in PTEN") + ggtitle("Clinvar path/likely path variant positions") +
  geom_point(data=subset(pten1_proc_wt, pten1_proc_wt$position %in% chosen), aes(x=factor(position), y=score), alpha = 0.85, position=jitter1) + scale_color_manual(values=twenty_color) + theme_bw()

#pten_pos_mean <- ggplot(pten_sum, aes(x=position, y=score))+ geom_bar(position=position_dodge(), stat="identity", colour="#999999") + geom_errorbar(aes(ymin=score-sd, ymax = score+sd), width=1, size=0.3, position=position_dodge()) +ylab("VAMP-seq score")+theme(axis.title.x = element_blank(), axis.text.x = element_blank()) + scale_x_continuous(breaks = seq(0, 403, 20), expand = c(0,0)) + scale_y_continuous(expand = c(0,0)) + geom_vline(xintercept=185, color="black", size=.1) + geom_vline(xintercept=350, color="black", size=.1)
#pretty trashy
ggplot(no_nonsense, aes(y=score, x=position)) + 
  geom_bar(data=subset(no_nonsense1, no_nonsense$position %in% pten_variance_filtered1$position), position=position_dodge(), stat="identity", colour="#999999") +
  geom_errorbar(data=subset(no_nonsense1, no_nonsense$position %in% pten_variance_filtered1$position), aes(ymin=score-sd, ymax=score+sd), width=1, size=0.3, position=position_dodge()) +
  #geom_violin(draw_quantiles=c(0.5), data=subset(no_nonsense1, no_nonsense1$position %in% pten_variance_filtered1$position), aes(group=position%%450), scale = "width") + 
  xlab("Position in PTEN") + ggtitle("Intolerant and tolerant amino acid positions") +
  #geom_point(data=subset(pten1_proc_wt, pten1_proc_wt$position %in% pten_variance_filtered1$position), aes(x=position, y=score), alpha = 0.85, position=jitter1) +
  #scale_color_manual(values=twenty_color) + 
  scale_x_continuous(breaks = seq(0, 403, 20), expand = c(0,0)) + scale_y_continuous(expand = c(0,0))+theme(axis.title.x = element_blank(), axis.text.x = element_blank(), legend.position="none")

pten_variance_filtered2 <- subset(subset(pten_variance, N > 10), sd > 0.35)
ggplot(no_nonsense, aes(y=sd, x=position)) +
  geom_bar(data=subset(no_nonsense, no_nonsense$position %in% pten_variance_filtered$position), position=position_dodge(), stat="identity", colour="#999999")

#all sd over .25, N>10
ggplot(pten_sum, aes(y=sd, x=position)) +
  geom_point(data=subset(pten_sum, pten_sum$position %in% pten_variance_filtered1$position)) +
  #geom_errorbar(data=subset(no_nonsense, no_nonsense$position %in% pten_variance_filtered2$position), aes(ymin=score-sd, ymax = score+sd), width=1, size=0.3, position=position_dodge()) + 
  scale_x_continuous(breaks = seq(0, 403, 20), expand = c(0,0)) + scale_y_continuous(expand = c(0,0))+theme(axis.title.x = element_blank(), axis.text.x = element_blank(), legend.position="none")

#this is how _ is defined: pten_variance <- summarySE((subset(pten1_data, end != "X")), measurevar="score", groupvars="position", na.rm=TRUE)
pten_variance_filtered4 <-subset(pten_variance, N>5)
variance_bar <- ggplot(pten_variance_filtered4, aes(y=sd, x=position)) +
  geom_segment(aes(x=position, xend=position, y=0, yend=sd), color="grey68") +
  geom_point(size=0.5) +
  scale_x_continuous(breaks = seq(0, 403, 20), expand = c(0,0)) + scale_y_continuous(expand = c(0,0))+theme(axis.title.x = element_blank(), axis.text.x = element_blank(), legend.position="none")
plot(variance_bar)

#all sd
ggplot(pten_sum, aes(y=sd, x=position)) +
  geom_segment(aes(x=position, xend=position, y=0, yend=sd), color="grey48") +
  #geom_point() +
  #geom_errorbar(data=subset(no_nonsense, no_nonsense$position %in% pten_variance_filtered2$position), aes(ymin=score-sd, ymax = score+sd), width=1, size=0.3, position=position_dodge()) + 
  scale_x_continuous(breaks = seq(0, 403, 20), expand = c(0,0)) + scale_y_continuous(expand = c(0,0))+theme(axis.title.x = element_blank(), axis.text.x = element_blank(), legend.position="none")

NA
pten_sum_filt <- pten_sum
pten_sum_filt$sd2 = pten_variance_filtered4[match(pten_sum_filt$position, pten_variance_filtered4$position), "sd"]
pten_pos_colored_mean <- ggplot(pten_sum_filt, aes(x=position, y=score))+ geom_bar(position=position_dodge(), stat="identity", colour=pten_sum_filt$sd2) + geom_errorbar(aes(ymin=score-sd, ymax = score+sd), width=1, size=0.3, position=position_dodge()) +ylab("VAMP-seq score")+theme(axis.title.x = element_blank(), axis.text.x = element_blank()) + scale_x_continuous(breaks = seq(0, 403, 20), expand = c(0,0)) + scale_y_continuous(expand = c(0,0)) + geom_vline(xintercept=185, color="black", size=.1) + geom_vline(xintercept=350, color="black", size=.1)+ scale_colour_gradient(low = "white", high = "yellow")
plot(pten_pos_colored_mean)

#method2 (use for final layout, size specification, download)
gd=ggplot_gtable(ggplot_build(variance_bar))
maxWidth = grid::unit.pmax(ga$widths, gb$widths, gc$widths, gd$widths)
ga$widths <- as.list(maxWidth)
gb$widths <- as.list(maxWidth)
gc$widths <- as.list(maxWidth)
gd$widths <- as.list(maxWidth)
grid.newpage()

#storing, with specified widths!!
pdf('pten_tpmt_mean_heat_variance.pdf', width=8, height=6)
#grid.arrange(arrangeGrob(gC,gA,gB,nrow=3,heights=c(.1,.3,.8)))
grid.arrange(arrangeGrob(gc,gd,ga,gb,nrow=4,heights=c(.1,.15,.15,.6)))
dev.off()
quartz_off_screen 
                3 

#Identifying items in tail to investigate
pten1_nonsense <- subset(pten1_proc, class == "nonsense")
tpmt1_nonsense <- subset(tpmt1_proc, class == "nonsense")
pten1_synon <- subset(pten1_proc, class == "synonymous")
tpmt1_synon <- subset(tpmt1_proc, class == "synonymous")
pten1_no_missense <- subset(pten1_proc, class == "synonymous" | class == "nonsense")
ggplot(pten1_nonsense, aes(x=score)) + geom_histogram(binwidth=.01, colour="blue", fill="white") + theme_bw()

#+ geom_density()
ggplot(pten1_synon, aes(x=score)) + geom_histogram(binwidth=.01, colour="red", fill="white") + theme_bw()

ggplot(pten1_proc_wt, aes(x=score)) + geom_histogram(data=subset(pten1_proc_wt,class == "nonsense"), fill = "red", alpha = 0.5, binwidth=.01) + geom_histogram(data=subset(pten1_proc_wt,class == "synonymous"), fill = "blue", alpha = 0.5, binwidth=.01) + geom_histogram(data=subset(pten1_proc_wt,class == "missense"), fill = "green", alpha = 0.2, binwidth=.01) + theme_bw()

ggplot(pten1_no_missense, aes(x=score)) + geom_histogram(data=subset(pten1_no_missense,class == "nonsense"), fill = "red", alpha = 0.5, binwidth=.01) + geom_histogram(data=subset(pten1_no_missense,class == "synonymous"), fill = "blue", alpha = 0.5, binwidth=.01) + theme_bw()

ggplot(tpmt1_synon, aes(x=score)) + geom_histogram(binwidth=.01, colour="red", fill="white") + theme_bw()

ggplot(tpmt1_nonsense, aes(x=score)) + geom_histogram(binwidth=.01, colour="blue", fill="white") + theme_bw()

#0.55
nonsense_tail <- subset(pten1_nonsense, score > 0.6)
synon_tail <- subset(pten1_synon, score < 0.6)
nonsense_tail$secondary_struct <- ifelse(is.na(nonsense_tail$helix), "unknown",
                        ifelse(nonsense_tail$helix==1, "helix",
                        ifelse(nonsense_tail$sheet==1, "sheet",
                        ifelse(nonsense_tail$helix==0, "neither",
                        "unknown"))))
synon_tail$secondary_struct <- ifelse(is.na(synon_tail$helix), "unknown",
                        ifelse(synon_tail$helix==1, "helix",
                        ifelse(synon_tail$sheet==1, "sheet",
                        ifelse(synon_tail$helix==0, "neither",
                        "unknown"))))
#data[row,column]
n_tail <- nonsense_tail[,c(1,2,7,30,127)]
s_tail <- synon_tail[,c(1,2,7,30,127)]
n_tail$bp_pos <- (n_tail$position-1)*3
s_tail$bp_pos <- (s_tail$position-1)*3
n_tail
s_tail
#just in case there is a discernible pattern
s_tail_pos <- ggplot(s_tail, aes(x=position, y=score, colour=secondary_struct))+ geom_point(size=.3) + scale_x_continuous(minor_breaks = seq(0, 405, 5)) + scale_color_manual(values=c("#FF4848", "#00C853", "#5757FF", "#A9A9A9")) +ylab("VAMP-seq score")+xlab("Position in PTEN")+labs(colour="Secondary Structure")+ggtitle("PTEN synonymous variant tail scores in relation to protein structure") + geom_vline(xintercept=47, color="black", size=.1) + geom_vline(xintercept=78, color="black", size=.1) + geom_vline(xintercept=122.5, color="black", size=.1) + geom_vline(xintercept=140, color="black", size=.1) + geom_vline(xintercept=165, color="black", size=.1) + geom_vline(xintercept=194, color="black", size=.1) + geom_vline(xintercept=209, color="black", size=.1) + theme_bw()
plot(s_tail_pos)

#help visualizing NMD rules
n_tail_pos <- ggplot(n_tail, aes(x=position, y=score, colour=secondary_struct))+ geom_point(size=.3) + scale_x_continuous(minor_breaks = seq(0, 405, 5)) + scale_color_manual(values=c("#FF4848", "#00C853", "#5757FF", "#A9A9A9")) +ylab("VAMP-seq score")+xlab("Position in PTEN")+labs(colour="Secondary Structure")+ggtitle("PTEN nonsense variant tail scores in relation to protein structure") + geom_vline(xintercept=47, color="black", size=.1) + geom_vline(xintercept=78, color="black", size=.1) + geom_vline(xintercept=122.5, color="black", size=.1) + geom_vline(xintercept=140, color="black", size=.1) + geom_vline(xintercept=165, color="black", size=.1) + geom_vline(xintercept=194, color="black", size=.1) + geom_vline(xintercept=209, color="black", size=.1) + theme_bw()
plot(n_tail_pos)

TPMT_abun_CADD <- ggplot(tpmt_merge, aes(x=abundance_class, y=CADD_raw_rankscore)) + geom_violin(draw_quantiles = c( 0.5))+ylab("CADD raw rankscore")+xlab("Abundance Class") + theme_bw()
plot(TPMT_abun_CADD)

TPMT_abun_SIFT_conv <- ggplot(tpmt_merge, aes(x=abundance_class, y=as.numeric(SIFT_converted_rankscore))) + geom_violin(draw_quantiles = c(0.5))+ylab("SIFT conv rankscore")+xlab("Abundance Class") + theme_bw()
plot(TPMT_abun_SIFT_conv)

TPMT_abun_POLY <- ggplot(tpmt_merge, aes(x=abundance_class, y=as.numeric(Polyphen2_HDIV_rankscore))) + geom_violin(draw_quantiles = c( 0.5))+ylab("Polyphen2 HDIV rankscore")+xlab("Abundance Class") + theme_bw()
plot(TPMT_abun_POLY)

TPMT_abun_POLY1 <- ggplot(tpmt_merge, aes(x=abundance_class, y=as.numeric(Polyphen2_HVAR_rankscore))) + geom_violin(draw_quantiles = c( 0.5))+ylab("Polyphen2 HVAR rankscore")+xlab("Abundance Class") + theme_bw()
plot(TPMT_abun_POLY1)

Pred_abun_SIFT <- ggplot(tpmt_merge, aes(abundance_class)) + geom_bar(aes(fill = SIFT_pred)) + ggtitle("Abundance class vs SIFT prediction of Damaging or Tolerated") + theme_bw()
plot(Pred_abun_SIFT)

trial_sep <- tpmt_merge[c(21,23,24,26)]
tpmt_merge_expand <- separate_rows(tpmt_merge, c("Polyphen2_HDIV_score", "Polyphen2_HDIV_pred", "Polyphen2_HVAR_score", "Polyphen2_HVAR_pred"))
Pred_abun_HVAR <- ggplot(tpmt_merge_expand, aes(abundance_class)) + geom_bar(aes(fill = Polyphen2_HVAR_pred)) + ggtitle("Abundance class vs Polyphen2 HVAR predictions") + labs(subtitle = "D: Probably Damaging, P: Possibly Damaging, B: Benign") + theme_bw()
plot(Pred_abun_HVAR)

twenty_color1 = c("#D02028", "#A4C33B","#53958B", "#E6A3B4", "#C5A0CA", "#554DA0", "#99247E", "#402059", "#82421B", "#7E807E", 'black', "#EDD941", "#F2F08E", "#EEC898", "#E1A12F", "#76C158",  "#BCDDAE", "#85782E", "#315935", "#A1DAE0", "#486EB6")
pten_dssp_schematic1 <- ggplot() +
  geom_segment(aes(x = 1, y = 0, xend = max(pten_extra$position)), yend = 0, size = 1, color = "grey70") +
  geom_point(data = subset(pten_extra, !is.na(xca)), aes(x = position, y = 0), color = "black", size = 1.8) +
  geom_point(data = subset(pten_extra, sheet == 1), aes(x = position, y = 0), color = "pink", size = 1.5) +
  geom_point(data = subset(pten_extra, helix == 1), aes(x = position, y = 0), color = "cyan", size = 1.5) +
  scale_x_continuous(breaks = seq(0, 403, 20), expand = c(0,0)) +
  scale_y_continuous(breaks = NULL, expand = c(0,0)) +xlab("Position in PTEN") + ylab("\n \n \n") +
  theme(panel.border = element_blank(), axis.text.y = element_blank())

aas <- c("A", "C", "P", "X")
aas1 <- c("S", "C", "P", "X")
aas2 <- c("A", "C", "D", "E", "F", "G", "H", "I", "K", "L", "M", "N", "P", "Q", "R", "S", "T", "V", "W", "Y")
pten_a_pos <- ggplot(data=subset(pten1_proc_wt, end=="A"), aes(x=position, y=score, colour=start))+ geom_point(data=subset(pten1_proc_wt, end=="A" & start %in% aas2), size=.6) + scale_x_continuous(breaks=seq(0, 403, 20), minor_breaks = seq(0, 403, 5), limits = c(0, 403), expand = c(0,0)) + scale_y_continuous(expand = c(0,0), limits = c(-0.25, 1.5)) + scale_color_manual(values=twenty_color1) +ylab("Abundance score")+labs(colour="Reference amino acid")+xlab(NULL)+ggtitle("Abundance scores of Alanine variants") + theme(axis.text.x = element_blank(), legend.position='top') + geom_hline(yintercept=1, color="black", size=.1)
#plot(pten_a_pos)

pten_s_pos <- ggplot(data=subset(pten1_proc_wt, end=="S"), aes(x=position, y=score, colour=start))+ geom_point(data=subset(pten1_proc_wt, end=="S" & start %in% aas2), size=.6) + scale_x_continuous(breaks=seq(0, 403, 20), minor_breaks = seq(0, 403, 5), limits = c(0, 403), expand = c(0,0)) + scale_y_continuous(expand = c(0,0), limits = c(-0.25, 1.5)) + scale_color_manual(values=twenty_color1) +ylab("Abundance score")+labs(colour="Reference amino acid")+xlab(NULL)+ggtitle("Abundance scores of Serine variants") + theme(axis.text.x = element_blank(), legend.position='top') + geom_hline(yintercept=1, color="black", size=.1)

pten_c_pos <- ggplot(data=subset(pten1_proc_wt, end=="C"), aes(x=position, y=score, colour=start))+ geom_point(data=subset(pten1_proc_wt, end=="C" & start %in% aas2), size=.6) + scale_x_continuous(breaks=seq(0, 403, 20), minor_breaks = seq(0, 403, 5), limits = c(0, 403), expand = c(0,0)) + scale_y_continuous(expand = c(0,0), limits = c(-0.25, 1.5)) + scale_color_manual(values=twenty_color1) +ylab("Abundance score")+labs(colour="Reference amino acid")+xlab(NULL)+ggtitle("Abundance scores of Cysteine variants") + theme(axis.text.x = element_blank(), legend.position='top') + geom_hline(yintercept=1, color="black", size=.1)


pten_x_pos <- ggplot(data=subset(pten1_proc_wt, end=="X"), aes(x=position, y=score, colour=start))+ geom_point(data=subset(pten1_proc_wt, end=="X" & start %in% aas2), size=.6) + scale_x_continuous(breaks=seq(0, 403, 20), minor_breaks = seq(0, 403, 5), limits = c(0, 403), expand = c(0,0)) + scale_y_continuous(expand = c(0,0), limits = c(-0.25, 1.5)) + scale_color_manual(values=twenty_color1) +ylab("Abundance score")+labs(colour="Reference amino acid")+xlab(NULL)+ggtitle("Abundance scores of nonsense variants") + theme(axis.text.x = element_blank(), legend.position='top') + geom_hline(yintercept=1, color="black", size=.1)

grid.newpage()
grid.draw(rbind(ggplotGrob(pten_a_pos), ggplotGrob(pten_dssp_schematic1), size = "last"))
grid.newpage()
grid.draw(rbind(ggplotGrob(pten_s_pos), ggplotGrob(pten_dssp_schematic1), size = "last"))
grid.newpage()
grid.draw(rbind(ggplotGrob(pten_c_pos), ggplotGrob(pten_dssp_schematic1), size = "last"))
grid.newpage()
grid.draw(rbind(ggplotGrob(pten_x_pos), ggplotGrob(pten_dssp_schematic1), size = "last"))
#snake at the end (arches over grey region)
pten_p_pos <- ggplot(data=subset(pten1_proc_wt, end=="P"), aes(x=position, y=score, colour=start))+ geom_point(data=subset(pten1_proc_wt, end=="P" & start %in% aas2), size=.6) + scale_x_continuous(breaks=seq(0, 403, 20), minor_breaks = seq(0, 403, 5), limits = c(0, 403), expand = c(0,0)) + scale_y_continuous(expand = c(0,0), limits = c(-0.25, 1.5)) + scale_color_manual(values=twenty_color1) +ylab("Abundance score")+labs(colour="Reference amino acid")+xlab(NULL)+ggtitle("Abundance scores of Proline variants") + theme(axis.text.x = element_blank(), legend.position='top') + geom_hline(yintercept=1, color="black", size=.1)
grid.newpage()
grid.draw(rbind(ggplotGrob(pten_p_pos), ggplotGrob(pten_dssp_schematic1), size = "last"))

twenty_color1 = c("#D02028", "#A4C33B","#53958B", "#E6A3B4", "#C5A0CA", "#554DA0", "#99247E", "#402059", "#82421B", "#7E807E", 'black', "#EDD941", "#F2F08E", "#EEC898", "#E1A12F", "#76C158",  "#BCDDAE", "#85782E", "#315935", "#A1DAE0", "#486EB6")
pten_dssp_schematic1 <- ggplot() +
  geom_segment(aes(x = 1, y = 0, xend = max(pten_extra$position)), yend = 0, size = 1, color = "grey70") +
  geom_point(data = subset(pten_extra, !is.na(xca)), aes(x = position, y = 0), color = "black", size = 1.8) +
  geom_point(data = subset(pten_extra, sheet == 1), aes(x = position, y = 0), color = "pink", size = 1.5) +
  geom_point(data = subset(pten_extra, helix == 1), aes(x = position, y = 0), color = "cyan", size = 1.5) +
  scale_x_continuous(breaks = seq(0, 403, 20), expand = c(0,0)) +
  scale_y_continuous(breaks = NULL, expand = c(0,0)) +xlab("Position in PTEN") + ylab("\n \n \n") +
  theme(panel.border = element_blank(), axis.text.y = element_blank())
aas <- c("A", "C", "P", "X")
aas1 <- c("S", "C", "P", "X")
aas2 <- c("A", "C", "D", "E", "F", "G", "H", "I", "K", "L", "M", "N", "P", "Q", "R", "S", "T", "V", "W", "Y")
pten_a_pos <- ggplot(data=subset(pten1_proc_wt, end=="A"), aes(x=position, y=score, colour=start))+ geom_point(data=subset(pten1_proc_wt, end=="A" & start %in% aas2), size=.6) + scale_x_continuous(breaks=seq(0, 403, 20), minor_breaks = seq(0, 403, 5), limits = c(0, 403), expand = c(0,0)) + scale_y_continuous(expand = c(0,0), limits = c(-0.25, 1.5)) + scale_color_manual(values=twenty_color1) +ylab("Abundance score")+labs(colour="Reference amino acid")+xlab(NULL)+ggtitle("Abundance scores of Alanine variants") + theme(axis.text.x = element_blank(), legend.position='top') + geom_hline(yintercept=1, color="black", size=.1)
#plot(pten_a_pos)
pten_s_pos <- ggplot(data=subset(pten1_proc_wt, end=="S"), aes(x=position, y=score, colour=start))+ geom_point(data=subset(pten1_proc_wt, end=="S" & start %in% aas2), size=.6) + scale_x_continuous(breaks=seq(0, 403, 20), minor_breaks = seq(0, 403, 5), limits = c(0, 403), expand = c(0,0)) + scale_y_continuous(expand = c(0,0), limits = c(-0.25, 1.5)) + scale_color_manual(values=twenty_color1) +ylab("Abundance score")+labs(colour="Reference amino acid")+xlab(NULL)+ggtitle("Abundance scores of Serine variants") + theme(axis.text.x = element_blank(), legend.position='top') + geom_hline(yintercept=1, color="black", size=.1)
pten_c_pos <- ggplot(data=subset(pten1_proc_wt, end=="C"), aes(x=position, y=score, colour=start))+ geom_point(data=subset(pten1_proc_wt, end=="C" & start %in% aas2), size=.6) + scale_x_continuous(breaks=seq(0, 403, 20), minor_breaks = seq(0, 403, 5), limits = c(0, 403), expand = c(0,0)) + scale_y_continuous(expand = c(0,0), limits = c(-0.25, 1.5)) + scale_color_manual(values=twenty_color1) +ylab("Abundance score")+labs(colour="Reference amino acid")+xlab(NULL)+ggtitle("Abundance scores of Cysteine variants") + theme(axis.text.x = element_blank(), legend.position='top') + geom_hline(yintercept=1, color="black", size=.1)
pten_x_pos <- ggplot(data=subset(pten1_proc_wt, end=="X"), aes(x=position, y=score, colour=start))+ geom_point(data=subset(pten1_proc_wt, end=="X" & start %in% aas2), size=.6) + scale_x_continuous(breaks=seq(0, 403, 20), minor_breaks = seq(0, 403, 5), limits = c(0, 403), expand = c(0,0)) + scale_y_continuous(expand = c(0,0), limits = c(-0.25, 1.5)) + scale_color_manual(values=twenty_color1) +ylab("Abundance score")+labs(colour="Reference amino acid")+xlab(NULL)+ggtitle("Abundance scores of nonsense variants") + theme(axis.text.x = element_blank(), legend.position='top') + geom_hline(yintercept=1, color="black", size=.1)
grid.newpage()
grid.draw(rbind(ggplotGrob(pten_a_pos), ggplotGrob(pten_dssp_schematic1), size = "last"))

grid.newpage()
grid.draw(rbind(ggplotGrob(pten_s_pos), ggplotGrob(pten_dssp_schematic1), size = "last"))

grid.newpage()
grid.draw(rbind(ggplotGrob(pten_c_pos), ggplotGrob(pten_dssp_schematic1), size = "last"))

grid.newpage()
grid.draw(rbind(ggplotGrob(pten_x_pos), ggplotGrob(pten_dssp_schematic1), size = "last"))

#snake at the end (arches over grey region)
pten_p_pos <- ggplot(data=subset(pten1_proc_wt, end=="P"), aes(x=position, y=score, colour=start))+ geom_point(data=subset(pten1_proc_wt, end=="P" & start %in% aas2), size=.6) + scale_x_continuous(breaks=seq(0, 403, 20), minor_breaks = seq(0, 403, 5), limits = c(0, 403), expand = c(0,0)) + scale_y_continuous(expand = c(0,0), limits = c(-0.25, 1.5)) + scale_color_manual(values=twenty_color1) +ylab("Abundance score")+labs(colour="Reference amino acid")+xlab(NULL)+ggtitle("Abundance scores of Proline variants") + theme(axis.text.x = element_blank(), legend.position='top') + geom_hline(yintercept=1, color="black", size=.1)
grid.newpage()
grid.draw(rbind(ggplotGrob(pten_p_pos), ggplotGrob(pten_dssp_schematic1), size = "last"))

#matching snake at the end!
pten_g_pos <- ggplot(data=subset(pten1_proc_wt, end=="G"), aes(x=position, y=score, colour=start))+ geom_point(data=subset(pten1_proc_wt, end=="G" & start %in% aas2), size=.6) + scale_x_continuous(breaks=seq(0, 403, 20), minor_breaks = seq(0, 403, 5), limits = c(0, 403), expand = c(0,0)) + scale_y_continuous(expand = c(0,0), limits = c(-0.25, 1.5)) + scale_color_manual(values=twenty_color1) +ylab("Abundance score")+labs(colour="Reference amino acid")+xlab(NULL)+ggtitle("Abundance scores of Glycine variants") + theme(axis.text.x = element_blank(), legend.position='top') + geom_hline(yintercept=1, color="black", size=.1)
grid.newpage()
grid.draw(rbind(ggplotGrob(pten_g_pos), ggplotGrob(pten_dssp_schematic1), size = "last"))

pten_h_pos <- ggplot(data=subset(pten1_proc_wt, end=="H"), aes(x=position, y=score, colour=start))+ geom_point(data=subset(pten1_proc_wt, end=="H" & start %in% aas2), size=.6) + scale_x_continuous(breaks=seq(0, 403, 20), minor_breaks = seq(0, 403, 5), limits = c(0, 403), expand = c(0,0)) + scale_y_continuous(expand = c(0,0), limits = c(-0.25, 1.5)) + scale_color_manual(values=twenty_color1) +ylab("Abundance score")+labs(colour="Reference amino acid")+xlab(NULL)+ggtitle("Abundance scores of Histidine variants") + theme(axis.text.x = element_blank(), legend.position='top') + geom_hline(yintercept=1, color="black", size=.1)
grid.newpage()
grid.draw(rbind(ggplotGrob(pten_h_pos), ggplotGrob(pten_dssp_schematic1), size = "last"))

#2nd from last grey region
pten_w_pos <- ggplot(data=subset(pten1_proc_wt, end=="W"), aes(x=position, y=score, colour=start))+ geom_point(data=subset(pten1_proc_wt, end=="W" & start %in% aas2), size=.6) + scale_x_continuous(breaks=seq(0, 403, 20), minor_breaks = seq(0, 403, 5), limits = c(0, 403), expand = c(0,0)) + scale_y_continuous(expand = c(0,0), limits = c(-0.25, 1.5)) + scale_color_manual(values=twenty_color1) +ylab("Abundance score")+labs(colour="Reference amino acid")+xlab(NULL)+ggtitle("Abundance scores of Typtophan variants") + theme(axis.text.x = element_blank(), legend.position='top') + geom_hline(yintercept=1, color="black", size=.1)
grid.newpage()
grid.draw(rbind(ggplotGrob(pten_w_pos), ggplotGrob(pten_dssp_schematic1), size = "last"))

---
title: "PTEN R Notebook"
output: html_notebook
---
```{r global_options, include=FALSE}
library(knitr)
library(ggplot2)
require(gridExtra)
library(reshape2)
library(pracma)
library(ggbeeswarm)
library(Rmisc)
library(grid)
library(EBImage)
library(googlesheets)
library(tidyr)
library(dplyr)
knitr::opts_chunk$set(fig.width=12, fig.height=12, warning=FALSE)
```
Multiplex assessment of protein variant abundance by massively parallel sequencing
VAMP-seq
- multiplex assay that uses fluorescent reporters to measure the steady-state abundance of protein variants in cultured human cells (each cellexpresses a single variant directly fused to EGFP...the stability of the variant dictates the abundance of the EGFP fusion and, accordingly, the green fluorescence signal of the cell)
- used to assess PTEN and TPMT variants 
```{r echo=FALSE}
par(pch=20, cex=.6)
#local
pten1_data <- read.delim('~/leklab/leklab/pten1.txt')
pten1_proc <- pten1_data[!is.na(pten1_data$abundance_class),]
dd <- data.frame(pten1_proc$abundance_class,pten1_proc$score)
colnames(dd) <- c("abundance_class", "score")
#local
tpmt1_data <- read.delim('~/leklab/leklab/tpmt_suppl_2.txt')
tpmt1_proc <- tpmt1_data[!is.na(tpmt1_data$abundance_class),]
ee <- data.frame(tpmt1_proc$abundance_class,tpmt1_proc$score)
colnames(ee) <- c("abundance_class", "score")
dd$protein <- rep("PTEN", nrow(dd))
ee$protein <- rep("TPMT", nrow(ee))
ff = data.frame(rbind(dd, ee))
bbpp = boxplot(score~protein+abundance_class, data = ff, at = c(1, 1.8, 3, 3.8, 5, 5.8, 7.2, 8), xaxt='n', col = c('white', 'gray'))
axis(side=1, at=c(1.4, 3.4, 5.4, 7.6), labels=c('low', 'possibly low', 'possibly\n wt-like', 'wt-like'))
title('VAMP-seq scores of PTEN and TPMT Variants\nand abundance class')

```

```{r echo=FALSE}
#plots VAMP-seq score vs abundance_class

VAMP_abundance <- ggplot(ff, aes(x=abundance_class, y=score, fill=protein)) + geom_violin(draw_quantiles = 0.5)+ylab("VAMP-seq score")+xlab("Abundance Class")+theme(legend.title=element_blank(), panel.grid.major = element_line(colour = "grey"), panel.grid.minor = element_line(colour = "grey"))+ggtitle("VAMP-seq scores for each abundance classification")+geom_point(data=data.frame(x="wt-like", y=1, protein = "PTEN"), aes(x,y), colour="black", size=1.5, show.legend=FALSE)+annotate("text", x = "wt-like", y=1.09, label = "WT",colour= "black", size = 4) + scale_y_continuous(minor_breaks = seq(-2, 2, .25))+theme_bw()
plot(VAMP_abundance)
```

```{r echo=FALSE}
#combining pten1_data and tpmt1_data into one large data frame, differentiate between the two w/ column 'protein' which specifies 'PTEN' or 'TPMT'
pten1_data$protein <- rep("PTEN", nrow(pten1_data))
tpmt1_data$protein <- rep("TPMT", nrow(tpmt1_data))
common_cols <- intersect(colnames(pten1_data), colnames(tpmt1_data))
comb_data = rbind(subset(pten1_data, select = common_cols), subset(tpmt1_data, select = common_cols))

#plots helix vs score for PTEN and TPMT side by side
#no NA

comb_data_helix <- comb_data[!is.na(comb_data$helix),]
#check to see where 3759 rows went off to
ck <- comb_data_helix[!is.na(comb_data_helix$abundance_class),]
comb_data_sheet <- comb_data[!is.na(comb_data$sheet),]
ck1 <- comb_data_sheet[!is.na(comb_data_sheet$abundance_class),]

h_plot <- ggplot(ck, aes(x=as.factor(helix), y=score, fill=protein)) + geom_violin(data=subset(ck, helix==1), draw_quantiles = c(0.5)) + guides(fill=FALSE) + xlab("Alpha Helix") + ylab("VAMP-seq score") + theme(axis.text.x = element_blank()) + scale_y_continuous(limits = c(-.7, 2.03))

s_plot <- ggplot(ck1, aes(x=as.factor(sheet), y=score, fill=protein)) + geom_violin(data=subset(ck1, sheet==1), draw_quantiles = c(0.5)) +  theme(axis.title.y = element_blank(), axis.text.y = element_blank(), axis.text.x = element_blank()) + xlab("Beta Sheet") + scale_y_continuous(limits = c(-.7, 2.03)) + guides(fill=FALSE) 

n_plot <- ggplot(ck, aes(x=as.factor(helix), y=score, fill=protein)) + geom_violin(data=subset(ck, helix==0 & sheet==0), draw_quantiles = c( 0.5)) + theme( axis.title.y = element_blank(), axis.text.y = element_blank(), axis.text.x = element_blank(), legend.justification=c(1,0), legend.position=c(.49,.75), legend.title=element_blank(), legend.text = element_text(size=10)) + xlab("Other") + scale_y_continuous(limits = c(-.7, 2.03))

#put the plots side by side
combined <- grid.arrange(h_plot, s_plot, n_plot, ncol=3, top = "Variant scores in relation to position in protein")
##############
##save as pdf

# pdf("violin_Variant_scores_vs.pdf")
# plot(combined)
# plot(VAMP_abundance)
# dev.off()
##############
#works to save single
#ggsave("Variant_scores_protein_position.pdf", plot = combined, device = "pdf", path = "/Users/go2alyssa/Desktop/", scale = 2.6, dpi = "retina")

```

```{r echo=FALSE}
# graph VAMP-seq scores relative to variant position in protein
#pten
pten1_proc_wt <- pten1_proc[!is.na(pten1_proc$position),]
pten1_proc_wt$secondary_struct <- ifelse(is.na(pten1_proc_wt$helix), "unknown",
                        ifelse(pten1_proc_wt$helix==1, "helix",
                        ifelse(pten1_proc_wt$sheet==1, "sheet",
                        ifelse(pten1_proc_wt$helix==0, "neither",
                        "unknown"))))
pten_pos <- ggplot(pten1_proc_wt, aes(x=position, y=score, colour=secondary_struct))+ geom_point(size=.3) + scale_x_continuous(minor_breaks = seq(0, 420, 5)) + scale_color_manual(values=c("#FF4848", "#00C853", "#5757FF", "#A9A9A9")) +ylab("VAMP-seq score")+xlab("Position in PTEN")+labs(colour="Secondary Structure")+ggtitle("PTEN scores in relation to protein structure") + geom_vline(xintercept=27, color="black", size=.1) + geom_vline(xintercept=55, color="black", size=.1) + geom_vline(xintercept=70, color="black", size=.1) + geom_vline(xintercept=85, color="black", size=.1) + geom_vline(xintercept=164.5, color="black", size=.1) + geom_vline(xintercept=212, color="black", size=.1) + geom_vline(xintercept=267.5, color="black", size=.1) + geom_vline(xintercept=343.5, color="black", size=.1)+theme_bw()

pten_hydro <- ggplot(pten1_proc_wt, aes(x=position, y=score, colour=(hydro2-hydro1)))+ geom_point(size=.3, alpha = 0.3) + scale_x_continuous(minor_breaks = seq(0, 420, 5)) + ylab("VAMP-seq score")+xlab("Position in PTEN")+labs(colour="Hydrophobicity")+ggtitle("PTEN scores in relation to change in hydrophobicity") + geom_vline(xintercept=27, color="black", size=.1) + geom_vline(xintercept=55, color="black", size=.1) + geom_vline(xintercept=70, color="black", size=.1) + geom_vline(xintercept=85, color="black", size=.1) + geom_vline(xintercept=164.5, color="black", size=.1) + geom_vline(xintercept=212, color="black", size=.1) + geom_vline(xintercept=267.5, color="black", size=.1) + geom_vline(xintercept=343.5, color="black", size=.1)


#tpmt
tpmt1_proc_wt <- tpmt1_proc[!is.na(tpmt1_proc$position),]
tpmt1_proc_wt$secondary_struct <- ifelse(is.na(tpmt1_proc_wt$helix), "unknown",
                        ifelse(tpmt1_proc_wt$helix==1, "helix",
                        ifelse(tpmt1_proc_wt$sheet==1, "sheet",
                        ifelse(tpmt1_proc_wt$helix==0, "neither",
                        "unknown"))))
tpmt_pos <- ggplot(tpmt1_proc_wt, aes(x=position, y=score, colour=secondary_struct))+ geom_point(size=.3) + scale_x_continuous(minor_breaks = seq(0, 405, 5)) + scale_color_manual(values=c("#FF4848", "#00C853", "#5757FF", "#A9A9A9")) +ylab("VAMP-seq score")+xlab("Position in TPMT")+labs(colour="Secondary Structure")+ggtitle("TPMT scores in relation to protein structure") + geom_vline(xintercept=47, color="black", size=.1) + geom_vline(xintercept=78, color="black", size=.1) + geom_vline(xintercept=122.5, color="black", size=.1) + geom_vline(xintercept=140, color="black", size=.1) + geom_vline(xintercept=165, color="black", size=.1) + geom_vline(xintercept=194, color="black", size=.1) + geom_vline(xintercept=209, color="black", size=.1)+theme_bw()

tpmt_colors <- tpmt1_proc_wt
#[order(position, variant),]
tpmt_colors$fact <- rep(10, nrow(tpmt_colors))
temp <- 1
for(i in 1:(length(tpmt_colors$fact)-1)) {
  if (tpmt_colors$secondary_struct[i] != tpmt_colors$secondary_struct[i+1]) {
    tpmt_colors$fact[i] <- temp
    temp <- temp + 1
  } else {
  tpmt_colors$fact[i] <- temp
  }
}
tpmt_colors$fact[length(tpmt_colors$fact)] <- temp

# cc <- 0
# for(i in 1:(length(tpmt_colors$fact)-1)) {
#   if (tpmt_colors$fact[i] != tpmt_colors$fact[i+1]) {
#     print(cc)
#     cc <- 0
#   } else {
#     cc <- cc + 1
#   }
# }

tpmt_pos_vp <- ggplot(tpmt_colors, aes(x=position, y=score))+ geom_violin(data=tpmt_colors[c(1:2783, 2798:4000),], aes(fill=as.character(fact), colour = factor(TRUE)), draw_quantiles = c(0.5), scale = "width") + 
scale_fill_manual(values=c("1" = "#A9A9A9", "2" = "#00C853", "3" = "#FF4848", "4" = "#00C853","5" = "#FF4848", "6" = "#00C853","7" = "#5757FF", "8" = "#00C853","9" = "#FF4848","10" = "#00C853","11" = "#5757FF", "12" = "#00C853","13" = "#FF4848", "14" = "#00C853", "15" = "#5757FF", "16" = "#00C853", "17" = "#5757FF", "18" = "#00C853", "19" = "#5757FF", "20" = "#00C853", "21" = "#5757FF", "22" = "#00C853", "23" = "#FF4848", "24" = "#5757FF", "25" = "#00C853", "26" = "#FF4848", "27" = "#00C853", "28" = "#5757FF", "29" = "#00C853", "30" = "#5757FF", "31" = "#00C853")) + scale_colour_manual(values = c("black")) + scale_x_continuous(minor_breaks = seq(0, 405, 5)) + ylab("VAMP-seq score")+xlab("Position in TPMT")+labs(colour="Secondary Structure")+ggtitle("TPMT scores in relation to protein structure") + geom_vline(xintercept=47, color="black", size=.1) + geom_vline(xintercept=78, color="black", size=.1) + geom_vline(xintercept=122.5, color="black", size=.1) + geom_vline(xintercept=140, color="black", size=.1) + geom_vline(xintercept=165, color="black", size=.1) + geom_vline(xintercept=194, color="black", size=.1) + geom_vline(xintercept=209, color="black", size=.1)


pten_hydro1 <- ggplot(pten1_proc_wt, aes(y=score, x=(hydro2-hydro1)))+ geom_point(size=0.5, alpha = 0.3) + ylab("Hydrophobicity")+xlab("VAMP-seq score")+ggtitle("PTEN scores in relation to change in hydrophobicity")

pten_aa_spread <- ggplot(pten1_proc_wt[2:288,], aes(y=score, x=start)) + geom_violin(draw_quantiles=c(0.5))
pten_aa_spread1 <- ggplot(pten1_proc_wt[2:288,], aes(y=score, x=start)) + geom_point(size = 0.5)


plot(pten_pos)
plot(tpmt_pos)

#plot(tpmt_pos_vp)
#plot(pten_hydro)
#plot(pten_hydro1)
#plot(pten_aa_spread)
#plot(pten_aa_spread1)
```
```{r}
# graph VAMP-seq scores relative to variant position in protein
#pten
pten1_hbond <- pten1_proc[!is.na(pten1_proc$hbond_sum),]
pten1_hbond$secondary_struct <- ifelse(is.na(pten1_hbond$helix), "unknown",
                        ifelse(pten1_hbond$helix==1, "helix",
                        ifelse(pten1_hbond$sheet==1, "sheet",
                        ifelse(pten1_hbond$helix==0, "neither",
                        "unknown"))))
pten_plot_hbond <- ggplot(pten1_hbond, aes(x=hbond_sum, y=score, colour=secondary_struct))+ geom_point(alpha=0.4) + ylab("VAMP-seq score")+xlab("DSSP Sum of hydrogen bonds")+ggtitle("PTEN scores in relation to hydrogen bonding") + scale_color_manual(values=c("#FF4848", "#696969", "#5757FF")) + labs(colour="Secondary Structure") + theme_bw()
plot(pten_plot_hbond)

pten_plot_hbond1 <- ggplot(pten1_hbond, aes(x=hbond_sum, y=score))+ geom_point(alpha = 0.2) + ylab("VAMP-seq score")+xlab("DSSP Sum of hydrogen bonds")+ggtitle("PTEN scores in relation to hydrogen bonding") + theme_bw()

# was in aes, ggplot function call ---> colour=secondary_struct
#scale_color_manual(values=c("#FF4848", "#00C853", "#5757FF", "#A9A9A9")) + labs(colour="Secondary Structure")+

plot(pten_plot_hbond1)
#less hydrogen bonds ~ higher abundance
```
```{r include=FALSE}
#TPMT
tpmt_sum <- summarySE(tpmt1_proc_wt, measurevar="score", groupvars="position")
#head(tpmt_sum)
tpmt_pos_mean <- ggplot(tpmt_sum, aes(x=position, y=score))+ geom_bar(position=position_dodge(), stat="identity", colour="#999999") + geom_errorbar(aes(ymin=score-sd, ymax = score+sd), width=1, size=0.3, position=position_dodge()) +ylab("VAMP-seq score")+theme(axis.title.x = element_blank(), axis.text.x = element_blank()) + scale_x_continuous(breaks = seq(0, 245, 10), expand = c(0,0)) + scale_y_continuous(expand = c(0,0))
#factor(position) is getting rid of some positions altogether on the graph
#tpmt_pos_mean <- ggplot(tpmt_sum, aes(x=factor(position), y=score))+ geom_bar(position=position_dodge(), stat="identity", colour="#999999") + geom_errorbar(aes(ymin=score-sd, ymax = score+sd), width=0.001, position=position_dodge()) +ylab("VAMP-seq score")+theme(axis.title.x = element_blank(), axis.text.x = element_blank()) + scale_x_discrete(breaks = seq(0, 250, 10))
tpmt_heat <- ggplot(tpmt1_proc_wt, aes(position, end)) + geom_tile(aes(fill=score)) + scale_fill_gradientn(colours = c("#3F7CB9", "#FFEAF3", "#B21F4E"), values = scales::rescale(c(-0.7, 0.2, 1, 1.3, 2.03)))+ scale_x_continuous(breaks = seq(0, 245, 10), expand=c(0,0)) + theme(legend.position='bottom')+xlab("Position in TPMT") + scale_y_discrete(expand = c(0,0))
#scale_fill_gradient2(low="#3F7CB9", mid="white", high="#B21F4E", midpoint=1) 
#scale_fill_distiller(palette= "RdBu")
tpmt_dssp_schematic <- ggplot() + ggtitle("TPMT mean abundance scores") +
  geom_segment(aes(x = 1, y = 0, xend = max(tpmt1_data$position)), yend = 0, size = 1, color = "grey70") +
  geom_point(data = tpmt1_data, aes(x = position, y = 0), color = "black", size = 1.8) +
  geom_point(data = subset(tpmt1_data, sheet == 1), aes(x = position, y = 0), color = "pink", size = 1.5) +
  geom_point(data = subset(tpmt1_data, helix == 1), aes(x = position, y = 0), color = "cyan", size = 1.5) +
  scale_x_continuous(breaks = seq(0, 245, 10), expand = c(0,0)) +
  scale_y_continuous(breaks = NULL, expand = c(0,0)) +xlab(NULL) + ylab(NULL) +
  theme(panel.border = element_blank(), axis.text.x = element_blank(), axis.text.y = element_blank())

#plots
plot(tpmt_pos_mean)
plot(tpmt_heat)
tpmt_dssp_schematic

#+ geom_vline(xintercept=185, color="black", size=.1) + geom_vline(xintercept=350, color="black", size=.1)

#grouping all variants in the same secondary structure together
tpmt_aa_sum <- summarySE(tpmt_colors, measurevar="score", groupvars="fact")
tpmt_aa_mean <- ggplot(tpmt_aa_sum, aes(x=fact, y=score))+ geom_bar(position=position_dodge(), stat="identity", colour="#999999") + geom_errorbar(aes(ymin=score-sd, ymax = score+sd), width=1, size=0.3, position=position_dodge()) +ylab("VAMP-seq score")+theme(axis.text.x = element_blank()) + scale_x_discrete(breaks = seq(0, 245, 10)) + xlab("each bar is a different secondary structure")
plot(tpmt_aa_mean)
```
```{r include=FALSE}
#PTEN
pten_sum <- summarySE(pten1_proc_wt, measurevar="score", groupvars="position")
#head(pten_sum)
pten_pos_mean <- ggplot(pten_sum, aes(x=position, y=score))+ geom_bar(position=position_dodge(), stat="identity", colour="#999999") + geom_errorbar(aes(ymin=score-sd, ymax = score+sd), width=1, size=0.3, position=position_dodge()) +ylab("VAMP-seq score")+theme(axis.title.x = element_blank(), axis.text.x = element_blank()) + scale_x_continuous(breaks = seq(0, 403, 20), expand = c(0,0)) + scale_y_continuous(expand = c(0,0)) + geom_vline(xintercept=185, color="black", size=.1) + geom_vline(xintercept=350, color="black", size=.1)
pten_heat <- ggplot(pten1_proc_wt, aes(position, end)) + geom_tile(aes(fill=score)) + scale_fill_gradientn(colours = c("#3F7CB9", "#FFEAF3", "#B21F4E"), values = scales::rescale(c(-0.23, 0.42, 1, 1.2, 1.47)))+ scale_x_continuous(breaks = seq(0, 403, 20), expand=c(0,0)) + theme(legend.position='bottom')+xlab("Position in TPMT") + scale_y_discrete(expand = c(0,0))
#c(-0.7, 0.2, 1, 1.3, 2.03)
#c(-0.23, 0.42, 1, 1.2, 1.47)

#local
pten_extra <- read.table(file = '~/leklab/leklab/PTEN_positional_data.tsv', sep = '\t', header = TRUE)
pten_dssp_schematic <- ggplot() + ggtitle("PTEN mean abundance scores") +
  geom_segment(aes(x = 1, y = 0, xend = max(pten_extra$position)), yend = 0, size = 1, color = "grey70") +
  geom_point(data = subset(pten_extra, !is.na(xca)), aes(x = position, y = 0), color = "black", size = 1.8) +
  geom_point(data = subset(pten_extra, sheet == 1), aes(x = position, y = 0), color = "pink", size = 1.5) +
  geom_point(data = subset(pten_extra, helix == 1), aes(x = position, y = 0), color = "cyan", size = 1.5) +
  scale_x_continuous(breaks = seq(0, 403, 20), expand = c(0,0)) +
  scale_y_continuous(breaks = NULL, expand = c(0,0)) +xlab(NULL) + ylab(NULL) +
  theme(panel.border = element_blank(), axis.text.x = element_blank(), axis.text.y = element_blank())

#plots
plot(pten_pos_mean)
plot(pten_heat)
pten_dssp_schematic

```


```{r include=FALSE}
#...pointless to represent data this way ...

#dbNSFP
#setup
tpmt_merge_copy <- tpmt_merge
tpmt_merge_expand_copy <- tpmt_merge_expand
#tpmt_merge_copy$SIFT_converted_rankscore[which(tpmt_merge_copy$SIFT_score == '.')] = NA
tpmt_merge_copy$SIFT_score <- as.numeric(tpmt_merge_copy$SIFT_score)
tpmt_merge_copy$SIFT_converted_rankscore <- as.numeric(tpmt_merge_copy$SIFT_converted_rankscore)
tpmt_merge_expand_copy$Polyphen2_HVAR_score <- as.numeric(tpmt_merge_expand_copy$Polyphen2_HVAR_score)

#sift_sum
tpmt_sift_sum <- summarySE(tpmt_merge_copy, measurevar="SIFT_score", groupvars="position")
tss_plot <- ggplot(tpmt_sift_sum, aes(x=position, y=SIFT_score))+ geom_bar(position=position_dodge(), stat="identity", colour="#999999") + geom_errorbar(aes(ymin=SIFT_score-sd, ymax =SIFT_score+sd), width=1, size=0.3, position=position_dodge()) +ylab("dfNSFP SIFT score")+theme(axis.title.x = element_blank(), axis.text.x = element_blank()) + scale_x_continuous(breaks = seq(0, 245, 10), expand = c(0,0)) + scale_y_continuous(expand = c(0,0))
plot(tss_plot)

#HVAR_sum
tpmt_hvar_sum <-summarySE(tpmt_merge_expand_copy, measurevar="Polyphen2_HVAR_score", groupvars="position")
ths_plot <- ggplot(tpmt_hvar_sum, aes(x=position, y=Polyphen2_HVAR_score))+ geom_bar(position=position_dodge(), stat="identity", colour="#999999") + geom_errorbar(aes(ymin=Polyphen2_HVAR_score-sd, ymax =Polyphen2_HVAR_score+sd), width=1, size=0.3, position=position_dodge()) +ylab("dfNSFP HVAR score")+theme(axis.title.x = element_blank(), axis.text.x = element_blank()) + scale_x_continuous(breaks = seq(0, 245, 10), expand = c(0,0)) + scale_y_continuous(expand = c(0,0))
plot(ths_plot)
```

```{r}
#method1 (basic, for visualizing in rstudio)
grid.newpage()
grid.draw(rbind(ggplotGrob(tpmt_dssp_schematic), ggplotGrob(tpmt_pos_mean), ggplotGrob(tpmt_heat), size = "last"))

grid.newpage()
grid.draw(rbind(ggplotGrob(pten_dssp_schematic), ggplotGrob(pten_pos_mean), ggplotGrob(pten_heat), size = "last"))
```
```{r include=FALSE}
#method2 (use for final layout, size specification, download)
gA=ggplot_gtable(ggplot_build(tpmt_pos_mean))
gB=ggplot_gtable(ggplot_build(tpmt_heat))
gC=ggplot_gtable(ggplot_build(tpmt_dssp_schematic))
ga=ggplot_gtable(ggplot_build(pten_pos_mean))
gb=ggplot_gtable(ggplot_build(pten_heat))
gc=ggplot_gtable(ggplot_build(pten_dssp_schematic))
maxWidth = grid::unit.pmax(gA$widths, gB$widths, gC$widths, ga$widths, gb$widths, gc$widths)
gA$widths <- as.list(maxWidth)
gB$widths <- as.list(maxWidth)
gC$widths <- as.list(maxWidth)
ga$widths <- as.list(maxWidth)
gb$widths <- as.list(maxWidth)
gc$widths <- as.list(maxWidth)

grid.newpage()

#storing, with specified widths!!
#pdf('pten_tpmt_mean_heat.pdf', width=8, height=6)
#grid.arrange(arrangeGrob(gC,gA,gB,nrow=3,heights=c(.1,.3,.8)))
#grid.arrange(arrangeGrob(gc,ga,gb,nrow=3,heights=c(.1,.3,.8)))
#dev.off()
```
```{r include=FALSE}
##attempt to order by abundance...
#tpmt_sum_o <- tpmt_sum[order(tpmt_sum$score),]
#tpmt_sum_o$position <- factor(tpmt_sum_o$position, levels=tpmt_sum_o$position[order(tpmt_sum_o$score)])
#tpmt_pos_mean_o <- ggplot(tpmt_sum_o, aes(x=factor(position), y=score))+ geom_bar(position=position_dodge(), stat="identity", colour="#999999") + geom_errorbar(aes(ymin=score-sd, ymax = score+sd), width=0.001, position=position_dodge()) +ylab("VAMP-seq score")+theme(axis.title.x = element_blank(), axis.text.x = element_blank()) + scale_x_discrete(breaks = seq(0, 250, 10))
#aaa <- tpmt1_proc_wt
#aaa$means <- tpmt_sum[match(aaa$position, tpmt_sum$position),2]
#bbb <- aaa[order(aaa$mean),]
#bbb$position <- as.factor(bbb$position)
#tpmt_heat_o <- ggplot(bbb, aes(x=factor(position), y=end)) + geom_tile(aes(fill=score)) + scale_fill_gradientn(colours = c("#3F7CB9", "#FFEAF3", "#B21F4E"), values = scales::rescale(c(-0.7, 0.2, 1, 1.3, 2.03)))+ scale_x_discrete(breaks = seq(0, 250, 10)) + theme(legend.position='bottom')+xlab("Position in TPMT")

#aaa$position <- factor(aaa$position, levels=unique(aaa$position)[order(aaa$means)])
#tpmt_heat_o <- ggplot(aaa, aes(x=factor(mean), y=end)) + geom_tile(aes(fill=score)) + scale_fill_gradientn(colours = c("#3F7CB9", "#FFEAF3", "#B21F4E"), values = scales::rescale(c(-0.7, 0.2, 1, 1.3, 2.03)))+ scale_x_discrete(breaks = seq(0, 250, 10)) + theme(legend.position='bottom')+xlab("Position in TPMT")
#tpmt_dssp_schematic <- ggplot() + ggtitle("TPMT mean abundance scores") +
#  geom_segment(aes(x = 1, y = 0, xend = max(tpmt1_data$position)), yend = 0, size = 1, color = "grey70") +
#  geom_point(data = tpmt1_data, aes(x = position, y = 0), color = "black", size = 1.8) +
#  geom_point(data = subset(tpmt1_data, sheet == 1), aes(x = position, y = 0), color = "pink", size = 1.5) +
#  geom_point(data = subset(tpmt1_data, helix == 1), aes(x = position, y = 0), color = "cyan", size = 1.5) +
#  scale_x_continuous(breaks = seq(0, 250, 10), expand = c(0,0)) +
#  scale_y_continuous(breaks = NULL, expand = c(0,0)) +xlab(NULL) + ylab(NULL) +
#  theme(panel.border = element_blank(), axis.text.x = element_blank(), axis.text.y = element_blank())
#tpmt_dssp_schematic
#plot(tpmt_pos_mean_o)
#plot(tpmt_heat_o)
```
```{r}
#plotting mean score vs variant changed to 
tpmt_end_sum <- summarySE(tpmt1_proc_wt, measurevar="score", groupvars="end")
tpmt_end_mean <- ggplot(tpmt_end_sum, aes(x=end, y=score)) +
  geom_bar(position=position_dodge(), stat="identity", colour="#999999") +
  geom_errorbar(aes(ymin=score-sd, ymax=score+sd), width=0.001, position=position_dodge()) +
  ylab("mean abundance") + xlab("variant amino acid") + theme_bw()
plot(tpmt_end_mean)

```
```{r include=FALSE}
#plotting scores vs end colored by location/secondary structure
#tpmt_end_scores_b <- ggplot(data=subset(tpmt1_proc_wt, sheet==1), aes(x=end, y=score, colour=position)) +
#  geom_violin(data=subset(tpmt1_proc_wt, sheet==1), draw_quantiles=0.5, scale = "width") +
#  geom_point(data=subset(tpmt1_proc_wt, sheet==1), size=.3, alpha = 0.6, position=jitter2) + ylab("abundance") + xlab("variant aa") +
#  scale_y_continuous(limits = c(-0.8, 2.2))

#tpmt_end_scores_a <- ggplot(data=subset(tpmt1_proc_wt, helix==1), aes(x=end, y=score, colour=position)) +
#  geom_violin(data=subset(tpmt1_proc_wt, helix==1), draw_quantiles=0.5, scale = "width") +
#  geom_point(data=subset(tpmt1_proc_wt, helix==1), size=.3, alpha = 0.6, position=jitter2) + ylab("abundance") + xlab("variant aa") +
#  scale_y_continuous(limits = c(-0.8, 2.2))

#tpmt_end_scores_n <- ggplot(data=subset(tpmt1_proc_wt, secondary_struct=="neither"), aes(x=end, y=score, colour=position)) +
#  geom_violin(data=subset(tpmt1_proc_wt, secondary_struct=="neither"), draw_quantiles=0.5, scale = "width") +
#  geom_point(data=subset(tpmt1_proc_wt, secondary_struct=="neither"), size=.3, alpha = 0.6, position=jitter2) + ylab("abundance") + xlab("variant aa") +
#  scale_y_continuous(limits = c(-0.8, 2.2))

#plot(tpmt_end_scores_b)
#plot(tpmt_end_scores_a)
#plot(tpmt_end_scores_n)

#tpmt_end_scores_60 <- ggplot(data=subset(tpmt1_proc_wt, position<=60), aes(x=end, y=score, colour=position)) +
#  geom_violin(data=subset(tpmt1_proc_wt, position<=60), draw_quantiles=0.5, scale = "width") +
#  geom_point(data=subset(tpmt1_proc_wt, position<=60), size=.3, alpha = 0.6, position=jitter2) + ylab("abundance") + xlab("variant aa") +
#  scale_y_continuous(limits = c(-0.8, 2.2))
#tpmt_end_scores_120 <- ggplot(data=subset(tpmt1_proc_wt, position>60 & position<=120), aes(x=end, y=score, colour=position)) +
#  geom_violin(data=subset(tpmt1_proc_wt, position>60 & position<=120), draw_quantiles=0.5, scale = "width") +
#  geom_point(data=subset(tpmt1_proc_wt, position>60 & position<=120), size=.3, alpha = 0.6, position=jitter2) + ylab("abundance") + xlab("variant aa") +
#  scale_y_continuous(limits = c(-0.8, 2.2))
#tpmt_end_scores_180 <- ggplot(data=subset(tpmt1_proc_wt, position>120 & position<=180), aes(x=end, y=score, colour=position)) +
#  geom_violin(data=subset(tpmt1_proc_wt, position>120 & position<=180), draw_quantiles=0.5, scale = "width") +
#  geom_point(data=subset(tpmt1_proc_wt, position>120 & position<=180), size=.3, alpha = 0.6, position=jitter2) + ylab("abundance") + xlab("variant aa") +
#  scale_y_continuous(limits = c(-0.8, 2.2))
#tpmt_end_scores_245 <- ggplot(data=subset(tpmt1_proc_wt, position>180 & position<=245), aes(x=end, y=score, colour=position)) +
#  geom_violin(data=subset(tpmt1_proc_wt, position>180 & position<=245), draw_quantiles=0.5, scale = "width") +
#  geom_point(data=subset(tpmt1_proc_wt, position>180 & position<=245), size=.3, alpha = 0.6, position=jitter2) + ylab("abundance") + xlab("variant aa") +
#  scale_y_continuous(limits = c(-0.8, 2.2))

#plot(tpmt_end_scores_60)
#plot(tpmt_end_scores_120)
#plot(tpmt_end_scores_180)
#plot(tpmt_end_scores_245)
```


```{r echo=FALSE}
set.seed(153)
jitter <- position_jitter(width = 1, height = NULL)
jitter1 <-position_jitter(width = 0.08, height = NULL)
jitter2 <- position_jitter(width=0.13, height = NULL)
twenty_color = c("#D02028", "#E1A12F", "#EDD941", "#F2F08E", "#EEC898", "#BCDDAE", "#A4C33B", "#76C158", "#85782E", "#315935", "#53958B", "#A1DAE0", "#486EB6", "#E6A3B4", "#C5A0CA", "#554DA0", "#99247E", "#402059", "#82421B", "#7E807E", 'black')

#pten_k_spread1 <- ggplot(pten1_proc_wt, aes(y=score, x=start)) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "K"), aes(group=factor(position))) + xlab("Position in PTEN") + ggtitle("Lysine variant abundance scores")

pten_k_spread1 <- ggplot(pten1_proc_wt, aes(y=score, x=factor(position), colour = end)) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "K"), aes(group=factor(position)), scale = "width") + xlab("Position in PTEN") + ggtitle("Lysine variant abundance scores")+ geom_point(data=subset(pten1_proc_wt, start== "K"), aes(x=factor(position), y=score), alpha = 0.85, position=jitter2) + scale_color_manual(values=twenty_color) + theme_bw()
pten_k_aa <- ggplot(pten1_proc_wt, aes(y=score, x=end, colour = end)) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "K"), aes(group=end), scale = "width") + xlab("Amino acid") + ggtitle("Lysine variant abundance scores")+ geom_point(data=subset(pten1_proc_wt, start== "K"), aes(x=end, y=score), alpha = 0.75, position=jitter1) + scale_color_manual(values=twenty_color) + theme_bw()

pten_g_spread1 <- ggplot(pten1_proc_wt, aes(y=score, x=factor(position), colour = end)) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "G"), aes(group=factor(position)), scale = "width") + xlab("Position in PTEN") + ggtitle("Glycine variant abundance scores")+ geom_point(data=subset(pten1_proc_wt, start== "G"), aes(x=factor(position), y=score), alpha = 0.85, position=jitter2) + scale_color_manual(values=twenty_color) + theme_bw()
pten_g_aa <- ggplot(pten1_proc_wt, aes(y=score, x=end, colour = end)) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "G"), aes(group=end), scale = "width") + xlab("Amino acid") + ggtitle("Glycine variant abundance scores")+ geom_point(data=subset(pten1_proc_wt, start== "G"), aes(x=end, y=score), alpha = 0.75, position=jitter1) + scale_color_manual(values=twenty_color) + theme_bw()
pten_g_hydrodiff <- ggplot(pten1_proc_wt, aes(y=score, x=factor(position), colour = (hydro2-hydro1))) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "G"), aes(group=factor(position)), scale = "width") + xlab("Position in PTEN") + ggtitle("Glycine variant abundance scores w/ hydrophobicity change")+ geom_point(data=subset(pten1_proc_wt, start== "G"), aes(x=factor(position), y=score), alpha = 0.85, position=jitter2) + scale_color_distiller(palette = "Spectral") + theme_bw()

pten_c_spread <- ggplot(pten1_proc_wt, aes(y=score, x=position)) + geom_point(data=subset(pten1_proc_wt, start== "C")) + xlab("Position in PTEN") + ggtitle("Cysteine variant abundance scores") + theme_bw()
#experiment_orig
#pten_c_spread1 <- ggplot(pten1_proc_wt, aes(y=score, x=position, colour = end)) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "C"), aes(group=position%%450), scale = "width") + xlab("Position in PTEN") + ggtitle("Cysteine variant abundance scores")+ geom_point(data=subset(pten1_proc_wt, start== "C"), aes(x=position, y=score), alpha = 0.75, position=jitter) + scale_color_manual(values=c("#D02028", "#E1A12F", "#EDD941", "#F2F08E", "#EEC898", "#BCDDAE", "#A4C33B", "#76C158", "#85782E", "#315935", "#53958B", "#A1DAE0", "#486EB6", "#E6A3B4", "#C5A0CA", "#554DA0", "#99247E", "#402059", "#82421B", "#7E807E", 'black'))
#, scale = "count"
#+ geom_point(data=subset(pten1_proc_wt, start== "C"), aes(x=position, y=score), alpha = 0.5)
pten_c_spread1 <- ggplot(pten1_proc_wt, aes(y=score, x=position, colour = end)) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "C"), aes(group=position%%450), scale = "width") + xlab("Position in PTEN") + ggtitle("Cysteine variant abundance scores")+ geom_point(data=subset(pten1_proc_wt, start== "C"), aes(x=position, y=score), alpha = 0.75, position=jitter1) + scale_color_manual(values=twenty_color) + theme_bw()
pten_c_aa <- ggplot(pten1_proc_wt, aes(y=score, x=end, colour = end)) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "C"), aes(group=end), scale = "width") + xlab("Amino acid") + ggtitle("Cysteine variant abundance scores")+ geom_point(data=subset(pten1_proc_wt, start== "C"), aes(x=end, y=score), alpha = 0.75, position=jitter1) + scale_color_manual(values=twenty_color) + theme_bw()

pten_c_hydrodiff <- ggplot(pten1_proc_wt, aes(y=score, x=position, colour = (hydro2-hydro1))) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "C"), aes(group=position%%450), scale = "width") + xlab("Position in PTEN") + ggtitle("Cysteine variant abundance scores w/ hydrophobicity change")+ geom_point(data=subset(pten1_proc_wt, start== "C"), aes(x=position, y=score), alpha = 0.75, position=jitter1) + scale_color_distiller(palette = "Spectral") + theme_bw()
#in in geom_violin(aes()) -> colour = hydro1


#pten_s_spread <- ggplot(pten1_proc_wt, aes(y=score, x=position)) + geom_point(data=subset(pten1_proc_wt, start== "S")) + xlab("Position in PTEN") + ggtitle("Serine variant abundance scores")
#pten_s_spread1_old <- ggplot(pten1_proc_wt, aes(y=score, x=start)) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "S"), aes(group=factor(position))) + xlab("Position in PTEN") + ggtitle("Serine variant abundance scores")
pten_s_spread1 <- ggplot(pten1_proc_wt, aes(y=score, x=factor(position), colour = end)) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "S"), aes(group=factor(position)), scale = "width") + xlab("Position in PTEN") + ggtitle("Serine variant abundance scores")+ geom_point(data=subset(pten1_proc_wt, start== "S"), aes(x=factor(position), y=score), alpha = 0.85, position=jitter2) + scale_color_manual(values=twenty_color) + theme_bw()
pten_s_aa <- ggplot(pten1_proc_wt, aes(y=score, x=end, colour = end)) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "S"), aes(group=end), scale = "width") + xlab("Amino acid") + ggtitle("Serine variant abundance scores")+ geom_point(data=subset(pten1_proc_wt, start== "S"), aes(x=end, y=score), alpha = 0.75, position=jitter1) + scale_color_manual(values=twenty_color) + theme_bw()

pten_s_hydrodiff <- ggplot(pten1_proc_wt, aes(y=score, x=factor(position), colour = (hydro2-hydro1))) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "S"), aes(group=factor(position)), scale = "width") + xlab("Position in PTEN") + ggtitle("Serine variant abundance scores w/ hydrophobicity change")+ geom_point(data=subset(pten1_proc_wt, start== "S"), aes(x=factor(position), y=score), alpha = 0.85, position=jitter2) + scale_color_distiller(palette = "Spectral") + theme_bw()
#graphing abundance vs change in hydrophobicity
pten_s_hh_hydrodiff <- ggplot(pten1_proc_wt, aes(y=score, x=factor(hydro2-hydro1), colour = (hydro2-hydro1))) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "S"), aes(group=factor(hydro2-hydro1)), scale = "width") + xlab("Change in hydrophobicity") + ggtitle("Serine variant abundance scores")+ geom_point(data=subset(pten1_proc_wt, start== "S"), aes(x=factor(hydro2-hydro1), y=score), alpha = 0.75, position=jitter1) + theme_bw()
#pten_s_aa_hydrodiff <- ggplot(pten1_proc_wt, aes(y=score, x=end, colour = (hydro2-hydro1))) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "S"), aes(group=end), scale = "width") + xlab("Amino acid") + ggtitle("Serine variant abundance scores")+ geom_point(data=subset(pten1_proc_wt, start== "S"), aes(x=end, y=score), alpha = 0.75, position=jitter1)
#pten_s_aa_voldiff <- ggplot(pten1_proc_wt, aes(y=score, x=end, colour = voldiff)) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "S"), aes(group=end), scale = "width") + xlab("Amino acid") + ggtitle("Serine variant abundance scores")+ geom_point(data=subset(pten1_proc_wt, start== "S"), aes(x=end, y=score), alpha = 0.75, position=jitter1)
#pten_s_aa_poldiff <- ggplot(pten1_proc_wt, aes(y=score, x=end, colour = polaritydiff)) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "S"), aes(group=end), scale = "width") + xlab("Amino acid") + ggtitle("Serine variant abundance scores")+ geom_point(data=subset(pten1_proc_wt, start== "S"), aes(x=end, y=score), alpha = 0.75, position=jitter1)
#pten_s_aa_weightdiff <- ggplot(pten1_proc_wt, aes(y=score, x=end, colour = weightdiff)) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "S"), aes(group=end), scale = "width") + xlab("Amino acid") + ggtitle("Serine variant abundance scores")+ geom_point(data=subset(pten1_proc_wt, start== "S"), aes(x=end, y=score), alpha = 0.75, position=jitter1)

#specific amino acid tests
##plot(pten_k_spread)
##plot(pten_g_spread)
##plot(pten_c_spread)

##plot(pten_s_spread)
##plot(pten_s_spread1_old)

#plot(pten_s_hh_hydrodiff) #probably not very useful... does not take into account position anymore
##plot(pten_s_aa_hydrodiff)
##plot(pten_s_aa_voldiff)
##plot(pten_s_aa_poldiff)
##plot(pten_s_aa_weightdiff)


plot(pten_c_spread1)
plot(pten_c_aa)
plot(pten_c_hydrodiff)
plot(pten_s_spread1)
plot(pten_s_aa)
plot(pten_s_hydrodiff)
plot(pten_g_spread1)
plot(pten_g_aa)
plot(pten_g_hydrodiff)
plot(pten_k_spread1)
plot(pten_k_aa)
```

```{r}
pten_a_spread1 <- ggplot(pten1_proc_wt, aes(y=score, x=factor(position), colour = end)) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "A"), aes(group=factor(position)), scale = "width") + xlab("Position in PTEN") + ggtitle("Alanine variant abundance scores")+ geom_point(data=subset(pten1_proc_wt, start== "A"), aes(x=factor(position), y=score), alpha = 0.85, position=jitter2) + scale_color_manual(values=twenty_color) + theme_bw()
pten_a_aa <- ggplot(pten1_proc_wt, aes(y=score, x=end, colour = end)) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "A"), aes(group=end), scale = "width") + xlab("Amino acid") + ggtitle("Alanine variant abundance scores")+ geom_point(data=subset(pten1_proc_wt, start== "A"), aes(x=end, y=score), alpha = 0.75, position=jitter1) + scale_color_manual(values=twenty_color) + theme_bw()

pten_r_spread1 <- ggplot(pten1_proc_wt, aes(y=score, x=factor(position), colour = end)) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "R"), aes(group=factor(position)), scale = "width") + xlab("Position in PTEN") + ggtitle("Arganine variant abundance scores")+ geom_point(data=subset(pten1_proc_wt, start== "R"), aes(x=factor(position), y=score), alpha = 0.85, position=jitter2) + scale_color_manual(values=twenty_color) + theme_bw()
pten_r_aa <- ggplot(pten1_proc_wt, aes(y=score, x=end, colour = end)) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "R"), aes(group=end), scale = "width") + xlab("Amino acid") + ggtitle("Arganine variant abundance scores")+ geom_point(data=subset(pten1_proc_wt, start== "R"), aes(x=end, y=score), alpha = 0.75, position=jitter1) + scale_color_manual(values=twenty_color) + theme_bw()

pten_n_spread1 <- ggplot(pten1_proc_wt, aes(y=score, x=factor(position), colour = end)) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "N"), aes(group=factor(position)), scale = "width") + xlab("Position in PTEN") + ggtitle("Asparagine variant abundance scores")+ geom_point(data=subset(pten1_proc_wt, start== "N"), aes(x=factor(position), y=score), alpha = 0.85, position=jitter2) + scale_color_manual(values=twenty_color) + theme_bw()
pten_n_aa <- ggplot(pten1_proc_wt, aes(y=score, x=end, colour = end)) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "N"), aes(group=end), scale = "width") + xlab("Amino acid") + ggtitle("Asparagine variant abundance scores")+ geom_point(data=subset(pten1_proc_wt, start== "N"), aes(x=end, y=score), alpha = 0.75, position=jitter1) + scale_color_manual(values=twenty_color) + theme_bw()

pten_d_spread1 <- ggplot(pten1_proc_wt, aes(y=score, x=factor(position), colour = end)) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "D"), aes(group=factor(position)), scale = "width") + xlab("Position in PTEN") + ggtitle("Aspartic Acid variant abundance scores")+ geom_point(data=subset(pten1_proc_wt, start== "D"), aes(x=factor(position), y=score), alpha = 0.85, position=jitter2) + scale_color_manual(values=twenty_color) + theme_bw()
pten_d_aa <- ggplot(pten1_proc_wt, aes(y=score, x=end, colour = end)) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "D"), aes(group=end), scale = "width") + xlab("Amino acid") + ggtitle("Aspartic Acid variant abundance scores")+ geom_point(data=subset(pten1_proc_wt, start== "D"), aes(x=end, y=score), alpha = 0.75, position=jitter1) + scale_color_manual(values=twenty_color) + theme_bw()

pten_n_hydrodiff <- ggplot(pten1_proc_wt, aes(y=score, x=factor(position), colour = (hydro2-hydro1))) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "N"), aes(group=factor(position)), scale = "width") + xlab("Position in PTEN") + ggtitle("Asparagine variant abundance scores w/ hydrophobicity change")+ geom_point(data=subset(pten1_proc_wt, start== "N"), aes(x=factor(position), y=score), alpha = 0.85, position=jitter2) + scale_color_distiller(palette = "Spectral") + theme_bw()

pten_d_hydrodiff <- ggplot(pten1_proc_wt, aes(y=score, x=factor(position), colour = (hydro2-hydro1))) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "D"), aes(group=factor(position)), scale = "width") + xlab("Position in PTEN") + ggtitle("Aspartic Acid variant abundance scores w/ hydrophobicity change")+ geom_point(data=subset(pten1_proc_wt, start== "D"), aes(x=factor(position), y=score), alpha = 0.85, position=jitter2) + scale_color_distiller(palette = "Spectral") + theme_bw()

plot(pten_a_spread1)
plot(pten_a_aa)
plot(pten_r_spread1)
plot(pten_r_aa)
plot(pten_n_spread1)
plot(pten_n_hydrodiff)
plot(pten_n_aa)
plot(pten_d_spread1)
plot(pten_d_hydrodiff)
plot(pten_d_aa)

```
```{r}
ggplot(pten1_proc_wt, aes(y=score, x=factor(position), colour = end)) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "P"), aes(group=factor(position)), scale = "width") + xlab("Position in PTEN") + ggtitle("Proline variant abundance scores")+ geom_point(data=subset(pten1_proc_wt, start== "P"), aes(x=factor(position), y=score), alpha = 0.85, position=jitter2) + scale_color_manual(values=twenty_color) + theme_bw()
ggplot(pten1_proc_wt, aes(y=score, x=end, colour = end)) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "P"), aes(group=end), scale = "width") + xlab("Amino acid") + ggtitle("Proline variant abundance scores")+ geom_point(data=subset(pten1_proc_wt, start== "P"), aes(x=end, y=score), alpha = 0.75, position=jitter1) + scale_color_manual(values=twenty_color) + theme_bw()
ggplot(pten1_proc_wt, aes(y=score, x=factor(position), colour = (hydro2-hydro1))) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "P"), aes(group=factor(position)), scale = "width") + xlab("Position in PTEN") + ggtitle("Proline variant abundance scores w/ hydrophobicity change")+ geom_point(data=subset(pten1_proc_wt, start== "P"), aes(x=factor(position), y=score), alpha = 0.85, position=jitter2) + scale_color_distiller(palette = "Spectral") + theme_bw()
```
```{r}
ggplot(pten1_proc_wt, aes(y=score, x=factor(position), colour = end)) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "T"), aes(group=factor(position)), scale = "width") + xlab("Position in PTEN") + ggtitle("T variant abundance scores")+ geom_point(data=subset(pten1_proc_wt, start== "T"), aes(x=factor(position), y=score), alpha = 0.85, position=jitter2) + scale_color_manual(values=twenty_color) + theme_bw()
ggplot(pten1_proc_wt, aes(y=score, x=end, colour = end)) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "T"), aes(group=end), scale = "width") + xlab("Amino acid") + ggtitle("T variant abundance scores")+ geom_point(data=subset(pten1_proc_wt, start== "T"), aes(x=end, y=score), alpha = 0.75, position=jitter1) + scale_color_manual(values=twenty_color) + theme_bw()
ggplot(pten1_proc_wt, aes(y=score, x=factor(position), colour = (hydro2-hydro1))) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "T"), aes(group=factor(position)), scale = "width") + xlab("Position in PTEN") + ggtitle("T variant abundance scores w/ hydrophobicity change")+ geom_point(data=subset(pten1_proc_wt, start== "T"), aes(x=factor(position), y=score), alpha = 0.85, position=jitter2) + scale_color_distiller(palette = "Spectral") + theme_bw()

```


```{r}
#in order to find position with low stddev (excluding nonsense)
pten_variance <- summarySE((subset(pten1_data, end != "X")), measurevar="score", groupvars="position", na.rm=TRUE)

#adjust N (minimum # variants at a position) and stddev
#5, 0.11
#10, 0.15
pten_variance_filtered <- subset(subset(pten_variance, N > 8 ), sd < 0.11)
pten_variance_filtered$position


#nonsense variant scores are graphed as dots, but are not included in the overlaying violin plots
ggplot(no_nonsense, aes(y=score, x=factor(position), colour = end)) + 
  geom_violin(draw_quantiles=c(0.5), data=subset(no_nonsense, no_nonsense$position %in% pten_variance_filtered$position), aes(group=factor(position)), scale = "width") + 
  xlab("Position in PTEN") + ggtitle("Intolerant and tolerant amino acid positions") +
  geom_point(data=subset(pten1_proc_wt, pten1_proc_wt$position %in% pten_variance_filtered$position), aes(x=factor(position), y=score), alpha = 0.85, position=jitter1) + scale_color_manual(values=twenty_color) + theme_bw()

ggplot(no_nonsense, aes(y=score, x=position, colour = end)) + 
  geom_violin(draw_quantiles=c(0.5), data=subset(no_nonsense, no_nonsense$position %in% pten_variance_filtered$position), aes(group=position%%450), scale = "width") + 
  xlab("Position in PTEN") + ggtitle("Intolerant and tolerant amino acid positions") +
  geom_point(data=subset(pten1_proc_wt, pten1_proc_wt$position %in% pten_variance_filtered$position), aes(x=position, y=score), alpha = 0.85, position=jitter1) + scale_color_manual(values=twenty_color) + theme_bw()

```
```{r}
#diff parameter 
pten_variance_filtered1 <- subset(subset(pten_variance, N > 10), sd > 0.25)

#nonsense variant scores are graphed as dots, but are not included in the overlaying violin plots
ggplot(no_nonsense, aes(y=score, x=factor(position), colour = end)) + 
  geom_violin(draw_quantiles=c(0.5), data=subset(no_nonsense, no_nonsense$position %in% pten_variance_filtered1$position), aes(group=factor(position)), scale = "width") + 
  xlab("Position in PTEN") + ggtitle("Intolerant and tolerant amino acid positions") +
  geom_point(data=subset(pten1_proc_wt, pten1_proc_wt$position %in% pten_variance_filtered1$position), aes(x=factor(position), y=score), alpha = 0.85, position=jitter1) + scale_color_manual(values=twenty_color) + theme_bw()

ggplot(no_nonsense, aes(y=score, x=position, colour = end)) + 
  geom_violin(draw_quantiles=c(0.5), data=subset(no_nonsense, no_nonsense$position %in% pten_variance_filtered1$position), aes(group=position%%450), scale = "width") + 
  xlab("Position in PTEN") + ggtitle("Intolerant and tolerant amino acid positions") +
  geom_point(data=subset(pten1_proc_wt, pten1_proc_wt$position %in% pten_variance_filtered1$position), aes(x=position, y=score), alpha = 0.85, position=jitter1) + scale_color_manual(values=twenty_color) + theme_bw()



chosen <- c(61, 68, 105, 108, 123, 127, 130, 132, 135, 155, 165, 173, 174, 246, 323, 335)

ggplot(pten1_proc_wt, aes(y=score, x=factor(position), colour = end)) + 
  geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, pten1_proc_wt$position %in% chosen), aes(group=factor(position)), scale = "width") + 
  xlab("Position in PTEN") + ggtitle("Clinvar path/likely path variant positions") +
  geom_point(data=subset(pten1_proc_wt, pten1_proc_wt$position %in% chosen), aes(x=factor(position), y=score), alpha = 0.85, position=jitter1) + scale_color_manual(values=twenty_color) + theme_bw()

```
```{r}
#pten_pos_mean <- ggplot(pten_sum, aes(x=position, y=score))+ geom_bar(position=position_dodge(), stat="identity", colour="#999999") + geom_errorbar(aes(ymin=score-sd, ymax = score+sd), width=1, size=0.3, position=position_dodge()) +ylab("VAMP-seq score")+theme(axis.title.x = element_blank(), axis.text.x = element_blank()) + scale_x_continuous(breaks = seq(0, 403, 20), expand = c(0,0)) + scale_y_continuous(expand = c(0,0)) + geom_vline(xintercept=185, color="black", size=.1) + geom_vline(xintercept=350, color="black", size=.1)

#pretty trashy
ggplot(no_nonsense, aes(y=score, x=position)) + 
  geom_bar(data=subset(no_nonsense1, no_nonsense$position %in% pten_variance_filtered1$position), position=position_dodge(), stat="identity", colour="#999999") +
  geom_errorbar(data=subset(no_nonsense1, no_nonsense$position %in% pten_variance_filtered1$position), aes(ymin=score-sd, ymax=score+sd), width=1, size=0.3, position=position_dodge()) +
  #geom_violin(draw_quantiles=c(0.5), data=subset(no_nonsense1, no_nonsense1$position %in% pten_variance_filtered1$position), aes(group=position%%450), scale = "width") + 
  xlab("Position in PTEN") + ggtitle("Intolerant and tolerant amino acid positions") +
  #geom_point(data=subset(pten1_proc_wt, pten1_proc_wt$position %in% pten_variance_filtered1$position), aes(x=position, y=score), alpha = 0.85, position=jitter1) +
  #scale_color_manual(values=twenty_color) + 
  scale_x_continuous(breaks = seq(0, 403, 20), expand = c(0,0)) + scale_y_continuous(expand = c(0,0))+theme(axis.title.x = element_blank(), axis.text.x = element_blank(), legend.position="none")

pten_variance_filtered2 <- subset(subset(pten_variance, N > 10), sd > 0.35)

ggplot(no_nonsense, aes(y=sd, x=position)) +
  geom_bar(data=subset(no_nonsense, no_nonsense$position %in% pten_variance_filtered$position), position=position_dodge(), stat="identity", colour="#999999")


#all sd over .25, N>10
ggplot(pten_sum, aes(y=sd, x=position)) +
  geom_point(data=subset(pten_sum, pten_sum$position %in% pten_variance_filtered1$position)) +
  #geom_errorbar(data=subset(no_nonsense, no_nonsense$position %in% pten_variance_filtered2$position), aes(ymin=score-sd, ymax = score+sd), width=1, size=0.3, position=position_dodge()) + 
  scale_x_continuous(breaks = seq(0, 403, 20), expand = c(0,0)) + scale_y_continuous(expand = c(0,0))+theme(axis.title.x = element_blank(), axis.text.x = element_blank(), legend.position="none")


#this is how _ is defined: pten_variance <- summarySE((subset(pten1_data, end != "X")), measurevar="score", groupvars="position", na.rm=TRUE)
pten_variance_filtered4 <-subset(pten_variance, N>5)
variance_bar <- ggplot(pten_variance_filtered4, aes(y=sd, x=position)) +
  geom_segment(aes(x=position, xend=position, y=0, yend=sd), color="grey68") +
  geom_point(size=0.5) +
  scale_x_continuous(breaks = seq(0, 403, 20), expand = c(0,0)) + scale_y_continuous(expand = c(0,0))+theme(axis.title.x = element_blank(), axis.text.x = element_blank(), legend.position="none")
plot(variance_bar)


#all sd
ggplot(pten_sum, aes(y=sd, x=position)) +
  geom_segment(aes(x=position, xend=position, y=0, yend=sd), color="grey48") +
  #geom_point() +
  #geom_errorbar(data=subset(no_nonsense, no_nonsense$position %in% pten_variance_filtered2$position), aes(ymin=score-sd, ymax = score+sd), width=1, size=0.3, position=position_dodge()) + 
  scale_x_continuous(breaks = seq(0, 403, 20), expand = c(0,0)) + scale_y_continuous(expand = c(0,0))+theme(axis.title.x = element_blank(), axis.text.x = element_blank(), legend.position="none")
  
```
```{r}
#i'm so tired
pten_sum_filt <- pten_sum
pten_sum_filt$sd2 = pten_variance_filtered4[match(pten_sum_filt$position, pten_variance_filtered4$position), "sd"]
pten_pos_colored_mean <- ggplot(pten_sum_filt, aes(x=position, y=score))+ geom_bar(position=position_dodge(), stat="identity", colour=pten_sum_filt$sd2) + geom_errorbar(aes(ymin=score-sd, ymax = score+sd), width=1, size=0.3, position=position_dodge()) +ylab("VAMP-seq score")+theme(axis.title.x = element_blank(), axis.text.x = element_blank()) + scale_x_continuous(breaks = seq(0, 403, 20), expand = c(0,0)) + scale_y_continuous(expand = c(0,0)) + geom_vline(xintercept=185, color="black", size=.1) + geom_vline(xintercept=350, color="black", size=.1)+ scale_colour_gradient(low = "white", high = "blue")
plot(pten_pos_colored_mean)
```


```{r}
#method2 (use for final layout, size specification, download)
gd=ggplot_gtable(ggplot_build(variance_bar))
maxWidth = grid::unit.pmax(ga$widths, gb$widths, gc$widths, gd$widths)
ga$widths <- as.list(maxWidth)
gb$widths <- as.list(maxWidth)
gc$widths <- as.list(maxWidth)
gd$widths <- as.list(maxWidth)
grid.newpage()

#storing, with specified widths!!
pdf('pten_tpmt_mean_heat_variance.pdf', width=8, height=6)
#grid.arrange(arrangeGrob(gC,gA,gB,nrow=3,heights=c(.1,.3,.8)))
grid.arrange(arrangeGrob(gc,gd,ga,gb,nrow=4,heights=c(.1,.15,.15,.6)))
dev.off()
```


```{r include=FALSE}
#on hold, creation of amino acid dataframe to plot abundance scores against
name <- c('Ala', 'Arg', 'Asn', 'Asp', 'Cys', 'Glu', 'Gln', 'Gly', 'His', 'Ile', 'Leu', 'Lys', 'Met', 'Phe', 'Pro', 'Ser', 'Thr', 'Trp', 'Tyr', 'Val')
quality <- c('Hydrophobic', 'Basic', 'Polar Neutral', 'Acidic', 'Polar Neutral', 'Acidic', 'Polar Neutral', 'Glycine', 'Basic', 'Hydrophobic', 'Hydrophobic', 'Basic', 'Hydrophobic', 'Hydrophobic', 'Hydrophobic', 'Polar Neutral', 'Polar Neutral', 'Hydrophobic', 'Hydrophobic', 'Hydrophobic')
#abundance <- get better scale
abundance <- c(0.0884, 0.057, 0.0417, 0.0539, 0.0124, 0.0624, 0.0382, 0.0703, 0.0220, 0.0595, 0.0994, 0.0527, 0.0237, 0.04, 0.0471, 0.0672, 0.0543, 0.0121, 0.03, 0.0677)
#isoelectric point <- unknown source (ncbi)
isoelectric <- c(6, 10.8, 5.4, 3, 5, 3.2, 5.7, 6, 7.6, 6, 6, 9.7, 5.7, 5.5, 6.3, 5.7, 5.6, 5.9, 5.7, 6.0)
hp_k_d <- c(1.8, -4.5, -3.5, -3.5, 2.5, -3.5, -3.5, -0.4, -3.2, 4.5, 3.8, -3.9, 1.9, 2.8, -1.6, -0.8, -0.7, -0.9, -1.3, 4.2)
hp_janin <-c(0.3, -1.4, -0.5, -0.6, 0.9, -0.7, -0.7, 0.3, -0.1, 0.7, 0.5, -1.8, 0.4, 0.5, -0.3, -0.1, -0.2, 0.3, -0.4, 0.6)
#Monera et al., J. Protein Sci (pro (-46) may be sketch)
hp_ph7 <- c(41, -14, -28, -55, 49, -31, -10, 0, 8, 99, 97, -23, 74, 100, -46, -5, 13, 97, 63, 76)
h_bonds <- c(0, 7, 5, 4, 0, 4, 5, 0, 3, 0, 0, 3, 0, 0, 0, 3, 3, 1, 3, 0)
mol_weight <-c(71, 156, 114, 115, 103, 129, 128, 57, 137, 113, 113, 128, 131, 147, 97, 87, 101, 186, 163, 99)

amino_acids.data <- data.frame(name, quality, abundance, isoelectric, hp_k_d, hp_janin, hp_ph7, h_bonds, mol_weight)

```

```{r}
#Identifying items in tail to investigate
pten1_nonsense <- subset(pten1_proc, class == "nonsense")
tpmt1_nonsense <- subset(tpmt1_proc, class == "nonsense")
pten1_synon <- subset(pten1_proc, class == "synonymous")
tpmt1_synon <- subset(tpmt1_proc, class == "synonymous")

pten1_no_missense <- subset(pten1_proc, class == "synonymous" | class == "nonsense")

ggplot(pten1_nonsense, aes(x=score)) + geom_histogram(binwidth=.01, colour="blue", fill="white") + theme_bw()
#+ geom_density()
ggplot(pten1_synon, aes(x=score)) + geom_histogram(binwidth=.01, colour="red", fill="white") + theme_bw()

ggplot(pten1_proc_wt, aes(x=score)) + geom_histogram(data=subset(pten1_proc_wt,class == "nonsense"), fill = "red", alpha = 0.5, binwidth=.01) + geom_histogram(data=subset(pten1_proc_wt,class == "synonymous"), fill = "blue", alpha = 0.5, binwidth=.01) + geom_histogram(data=subset(pten1_proc_wt,class == "missense"), fill = "green", alpha = 0.2, binwidth=.01) + theme_bw()

ggplot(pten1_no_missense, aes(x=score)) + geom_histogram(data=subset(pten1_no_missense,class == "nonsense"), fill = "red", alpha = 0.5, binwidth=.01) + geom_histogram(data=subset(pten1_no_missense,class == "synonymous"), fill = "blue", alpha = 0.5, binwidth=.01) + theme_bw()

ggplot(tpmt1_synon, aes(x=score)) + geom_histogram(binwidth=.01, colour="red", fill="white") + theme_bw()
ggplot(tpmt1_nonsense, aes(x=score)) + geom_histogram(binwidth=.01, colour="blue", fill="white") + theme_bw()
```
```{r}
#0.55
nonsense_tail <- subset(pten1_nonsense, score > 0.6)
synon_tail <- subset(pten1_synon, score < 0.6)
nonsense_tail$secondary_struct <- ifelse(is.na(nonsense_tail$helix), "unknown",
                        ifelse(nonsense_tail$helix==1, "helix",
                        ifelse(nonsense_tail$sheet==1, "sheet",
                        ifelse(nonsense_tail$helix==0, "neither",
                        "unknown"))))
synon_tail$secondary_struct <- ifelse(is.na(synon_tail$helix), "unknown",
                        ifelse(synon_tail$helix==1, "helix",
                        ifelse(synon_tail$sheet==1, "sheet",
                        ifelse(synon_tail$helix==0, "neither",
                        "unknown"))))

#data[row,column]
n_tail <- nonsense_tail[,c(1,2,7,30,127)]
s_tail <- synon_tail[,c(1,2,7,30,127)]
n_tail$bp_pos <- (n_tail$position-1)*3
s_tail$bp_pos <- (s_tail$position-1)*3

n_tail
s_tail
```
```{r}
#just in case there is a discernible pattern
s_tail_pos <- ggplot(s_tail, aes(x=position, y=score, colour=secondary_struct))+ geom_point(size=.3) + scale_x_continuous(minor_breaks = seq(0, 405, 5)) + scale_color_manual(values=c("#FF4848", "#00C853", "#5757FF", "#A9A9A9")) +ylab("VAMP-seq score")+xlab("Position in PTEN")+labs(colour="Secondary Structure")+ggtitle("PTEN synonymous variant tail scores in relation to protein structure") + geom_vline(xintercept=47, color="black", size=.1) + geom_vline(xintercept=78, color="black", size=.1) + geom_vline(xintercept=122.5, color="black", size=.1) + geom_vline(xintercept=140, color="black", size=.1) + geom_vline(xintercept=165, color="black", size=.1) + geom_vline(xintercept=194, color="black", size=.1) + geom_vline(xintercept=209, color="black", size=.1) + theme_bw()
plot(s_tail_pos)

#help visualizing NMD rules
n_tail_pos <- ggplot(n_tail, aes(x=position, y=score, colour=secondary_struct))+ geom_point(size=.3) + scale_x_continuous(minor_breaks = seq(0, 405, 5)) + scale_color_manual(values=c("#FF4848", "#00C853", "#5757FF", "#A9A9A9")) +ylab("VAMP-seq score")+xlab("Position in PTEN")+labs(colour="Secondary Structure")+ggtitle("PTEN nonsense variant tail scores in relation to protein structure") + geom_vline(xintercept=47, color="black", size=.1) + geom_vline(xintercept=78, color="black", size=.1) + geom_vline(xintercept=122.5, color="black", size=.1) + geom_vline(xintercept=140, color="black", size=.1) + geom_vline(xintercept=165, color="black", size=.1) + geom_vline(xintercept=194, color="black", size=.1) + geom_vline(xintercept=209, color="black", size=.1) + theme_bw()
plot(n_tail_pos)
```

```{r include=FALSE}
s_tail$prob_AG_GT <- c(0, 1/6, 1/2, 0, 1/2, 1/6)
s_tail$prob_titv <- c(0, 2/3, 2/3, 0, 2/3, 1/3)
ggplot(n_tail, aes(x=position,y=score)) + geom_point() + geom_smooth(method = "lm")
ggplot(s_tail, aes(x=prob_titv,y=score)) + geom_point() + geom_smooth(method = "lm")
ggplot(s_tail, aes(y=prob_titv,x=score)) + geom_point() + geom_smooth(method = "lm")
rsq <- function (x, y) cor(x, y)^2
n_rsq <- rsq(n_tail$position, s_tail$score)
s_rsq <- rsq(s_tail$prob_titv, s_tail$score)
n_rsq
s_rsq
#no relationship...
```

```{r include=FALSE}

gs_ls()
tpmt_ruddle <- gs_title("TPMT_ruddle")
tpmt_read <- gs_read(ss=tpmt_ruddle, ws = "ruddle_tpmt_variants")
tpmt_ruddle_data <- as.data.frame(tpmt_read)
```
```{r echo=FALSE}
#reversing data to fit tpmt1_data
rever <- function(df=tpmt_ruddle_data){df<-df[dim(df)[1]:1,]}
tpmt_ruddle_data_rev = rever(tpmt_ruddle_data)

#creating variant column, equiv to tpmt1_data's
tpmt_ruddle_data_rev$variant <- do.call(paste, c(tpmt_ruddle_data_rev[c(5,24,6)], sep=""))

#making both tables smaller
tpmt_essential <- tpmt_ruddle_data_rev[,c(2,3,4,5,6,17,19,24,27,28,29,30,31,32,33,34,35,76,77,78,137)]
tpmt1_proc_ess <- tpmt1_proc_wt[,c(1,2,3,5,6,7,30,32,80)]

#merging tables with variant name
tpmt_merge <- merge(tpmt1_proc_ess, tpmt_essential, by="variant")

#comparing abundance scores with various scores in dbNSFP (contains annotations of all potential non-synonymous single-nucleotide variants (nsSNVs) in the human genome)
tpmt_cor1 <- ggplot(tpmt_merge, aes(x=score, y=as.numeric(SIFT_score)))+ geom_point(alpha = 0.2) + xlab("VAMP-seq score")+ylab("SIFT score")+ggtitle("1") + theme_bw()
tpmt_cor1.5 <- ggplot(tpmt_merge, aes(x=score, y=as.numeric(SIFT_converted_rankscore)))+ geom_point(alpha = 0.2) + xlab("VAMP-seq score")+ylab("SIFT converted rankscore")+ggtitle("1.5") + theme_bw()
tpmt_cor5 <- ggplot(tpmt_merge, aes(x=score, y=CADD_raw_rankscore))+ geom_point(alpha = 0.2) + xlab("VAMP-seq score")+ylab("CADD raw rankscore")+ggtitle("5") + theme_bw()
tpmt_cor2 <- ggplot(tpmt_merge, aes(x=score, y=as.numeric(Polyphen2_HDIV_score)))+ geom_point(alpha = 0.2) + xlab("VAMP-seq score")+ylab("Polyphen2 HDIV score")+ggtitle("2") + theme_bw()
tpmt_cor3 <- ggplot(tpmt_merge, aes(x=score, y=as.numeric(Polyphen2_HVAR_score)))+ geom_point(alpha = 0.2) + xlab("VAMP-seq score")+ylab("Polyphen2 HVAR score")+ggtitle("3") + theme_bw()
tpmt_cor2.5 <- ggplot(tpmt_merge, aes(x=score, y=as.numeric(Polyphen2_HDIV_rankscore)))+ geom_point(alpha = 0.2) + xlab("VAMP-seq score")+ylab("Polyphen2 HDIV rankscore")+ggtitle("2.5") + theme_bw()
tpmt_cor3.5 <- ggplot(tpmt_merge, aes(x=score, y=as.numeric(Polyphen2_HVAR_rankscore)))+ geom_point(alpha = 0.2) + xlab("VAMP-seq score")+ylab("Polyphen2 HVAR rankscore")+ggtitle("3.5") + theme_bw()

#CADD_phred not worth

#plot(tpmt_cor5)
#plot(tpmt_cor1)
#plot(tpmt_cor1.5)
plot(tpmt_cor2)
plot(tpmt_cor3)
plot(tpmt_cor2.5)
plot(tpmt_cor3.5)
```
```{r}
TPMT_abun_CADD <- ggplot(tpmt_merge, aes(x=abundance_class, y=CADD_raw_rankscore)) + geom_violin(draw_quantiles = c( 0.5))+ylab("CADD raw rankscore")+xlab("Abundance Class") + theme_bw()
plot(TPMT_abun_CADD)

TPMT_abun_SIFT_conv <- ggplot(tpmt_merge, aes(x=abundance_class, y=as.numeric(SIFT_converted_rankscore))) + geom_violin(draw_quantiles = c(0.5))+ylab("SIFT conv rankscore")+xlab("Abundance Class") + theme_bw()
plot(TPMT_abun_SIFT_conv)

TPMT_abun_POLY <- ggplot(tpmt_merge, aes(x=abundance_class, y=as.numeric(Polyphen2_HDIV_rankscore))) + geom_violin(draw_quantiles = c( 0.5))+ylab("Polyphen2 HDIV rankscore")+xlab("Abundance Class") + theme_bw()
plot(TPMT_abun_POLY)

TPMT_abun_POLY1 <- ggplot(tpmt_merge, aes(x=abundance_class, y=as.numeric(Polyphen2_HVAR_rankscore))) + geom_violin(draw_quantiles = c( 0.5))+ylab("Polyphen2 HVAR rankscore")+xlab("Abundance Class") + theme_bw()
plot(TPMT_abun_POLY1)
```
```{r}
Pred_abun_SIFT <- ggplot(tpmt_merge, aes(abundance_class)) + geom_bar(aes(fill = SIFT_pred)) + ggtitle("Abundance class vs SIFT prediction of Damaging or Tolerated") + theme_bw()
plot(Pred_abun_SIFT)

trial_sep <- tpmt_merge[c(21,23,24,26)]
tpmt_merge_expand <- separate_rows(tpmt_merge, c("Polyphen2_HDIV_score", "Polyphen2_HDIV_pred", "Polyphen2_HVAR_score", "Polyphen2_HVAR_pred"))

Pred_abun_HVAR <- ggplot(tpmt_merge_expand, aes(abundance_class)) + geom_bar(aes(fill = Polyphen2_HVAR_pred)) + ggtitle("Abundance class vs Polyphen2 HVAR predictions") + labs(subtitle = "D: Probably Damaging, P: Possibly Damaging, B: Benign") + theme_bw()
plot(Pred_abun_HVAR)
```

```{r include=FALSE}
#creation of b-score text files for pymol use
pten_pymol <- summarySE(pten1_data, measurevar="score", groupvars="position", na.rm=TRUE)
#score[404] is wt
write.table(pten_pymol$score[1:403], "pten_mean_scores_pymol.txt", sep="\n", row.names=F, col.names=F, na = "NaN")

tpmt_pymol <- summarySE(tpmt1_data, measurevar="score", groupvars="position", na.rm=TRUE)
#score[404] is wt
write.table(tpmt_pymol$score[1:245], "tpmt_mean_scores_pymol.txt", sep="\n", row.names=F, col.names=F, na = "NaN")
```

```{r}
twenty_color1 = c("#D02028", "#A4C33B","#53958B", "#E6A3B4", "#C5A0CA", "#554DA0", "#99247E", "#402059", "#82421B", "#7E807E", 'black', "#EDD941", "#F2F08E", "#EEC898", "#E1A12F", "#76C158",  "#BCDDAE", "#85782E", "#315935", "#A1DAE0", "#486EB6")
pten_dssp_schematic1 <- ggplot() +
  geom_segment(aes(x = 1, y = 0, xend = max(pten_extra$position)), yend = 0, size = 1, color = "grey70") +
  geom_point(data = subset(pten_extra, !is.na(xca)), aes(x = position, y = 0), color = "black", size = 1.8) +
  geom_point(data = subset(pten_extra, sheet == 1), aes(x = position, y = 0), color = "pink", size = 1.5) +
  geom_point(data = subset(pten_extra, helix == 1), aes(x = position, y = 0), color = "cyan", size = 1.5) +
  scale_x_continuous(breaks = seq(0, 403, 20), expand = c(0,0)) +
  scale_y_continuous(breaks = NULL, expand = c(0,0)) +xlab("Position in PTEN") + ylab("\n \n \n") +
  theme(panel.border = element_blank(), axis.text.y = element_blank())

aas <- c("A", "C", "P", "X")
aas1 <- c("S", "C", "P", "X")
aas2 <- c("A", "C", "D", "E", "F", "G", "H", "I", "K", "L", "M", "N", "P", "Q", "R", "S", "T", "V", "W", "Y")
pten_a_pos <- ggplot(data=subset(pten1_proc_wt, end=="A"), aes(x=position, y=score, colour=start))+ geom_point(data=subset(pten1_proc_wt, end=="A" & start %in% aas2), size=.6) + scale_x_continuous(breaks=seq(0, 403, 20), minor_breaks = seq(0, 403, 5), limits = c(0, 403), expand = c(0,0)) + scale_y_continuous(expand = c(0,0), limits = c(-0.25, 1.5)) + scale_color_manual(values=twenty_color1) +ylab("Abundance score")+labs(colour="Reference amino acid")+xlab(NULL)+ggtitle("Abundance scores of Alanine variants") + theme(axis.text.x = element_blank(), legend.position='top') + geom_hline(yintercept=1, color="black", size=.1)
#plot(pten_a_pos)

pten_s_pos <- ggplot(data=subset(pten1_proc_wt, end=="S"), aes(x=position, y=score, colour=start))+ geom_point(data=subset(pten1_proc_wt, end=="S" & start %in% aas2), size=.6) + scale_x_continuous(breaks=seq(0, 403, 20), minor_breaks = seq(0, 403, 5), limits = c(0, 403), expand = c(0,0)) + scale_y_continuous(expand = c(0,0), limits = c(-0.25, 1.5)) + scale_color_manual(values=twenty_color1) +ylab("Abundance score")+labs(colour="Reference amino acid")+xlab(NULL)+ggtitle("Abundance scores of Serine variants") + theme(axis.text.x = element_blank(), legend.position='top') + geom_hline(yintercept=1, color="black", size=.1)

pten_c_pos <- ggplot(data=subset(pten1_proc_wt, end=="C"), aes(x=position, y=score, colour=start))+ geom_point(data=subset(pten1_proc_wt, end=="C" & start %in% aas2), size=.6) + scale_x_continuous(breaks=seq(0, 403, 20), minor_breaks = seq(0, 403, 5), limits = c(0, 403), expand = c(0,0)) + scale_y_continuous(expand = c(0,0), limits = c(-0.25, 1.5)) + scale_color_manual(values=twenty_color1) +ylab("Abundance score")+labs(colour="Reference amino acid")+xlab(NULL)+ggtitle("Abundance scores of Cysteine variants") + theme(axis.text.x = element_blank(), legend.position='top') + geom_hline(yintercept=1, color="black", size=.1)


pten_x_pos <- ggplot(data=subset(pten1_proc_wt, end=="X"), aes(x=position, y=score, colour=start))+ geom_point(data=subset(pten1_proc_wt, end=="X" & start %in% aas2), size=.6) + scale_x_continuous(breaks=seq(0, 403, 20), minor_breaks = seq(0, 403, 5), limits = c(0, 403), expand = c(0,0)) + scale_y_continuous(expand = c(0,0), limits = c(-0.25, 1.5)) + scale_color_manual(values=twenty_color1) +ylab("Abundance score")+labs(colour="Reference amino acid")+xlab(NULL)+ggtitle("Abundance scores of nonsense variants") + theme(axis.text.x = element_blank(), legend.position='top') + geom_hline(yintercept=1, color="black", size=.1)

grid.newpage()
grid.draw(rbind(ggplotGrob(pten_a_pos), ggplotGrob(pten_dssp_schematic1), size = "last"))
grid.newpage()
grid.draw(rbind(ggplotGrob(pten_s_pos), ggplotGrob(pten_dssp_schematic1), size = "last"))
grid.newpage()
grid.draw(rbind(ggplotGrob(pten_c_pos), ggplotGrob(pten_dssp_schematic1), size = "last"))
grid.newpage()
grid.draw(rbind(ggplotGrob(pten_x_pos), ggplotGrob(pten_dssp_schematic1), size = "last"))
```
```{r}
#snake at the end (arches over grey region)
pten_p_pos <- ggplot(data=subset(pten1_proc_wt, end=="P"), aes(x=position, y=score, colour=start))+ geom_point(data=subset(pten1_proc_wt, end=="P" & start %in% aas2), size=.6) + scale_x_continuous(breaks=seq(0, 403, 20), minor_breaks = seq(0, 403, 5), limits = c(0, 403), expand = c(0,0)) + scale_y_continuous(expand = c(0,0), limits = c(-0.25, 1.5)) + scale_color_manual(values=twenty_color1) +ylab("Abundance score")+labs(colour="Reference amino acid")+xlab(NULL)+ggtitle("Abundance scores of Proline variants") + theme(axis.text.x = element_blank(), legend.position='top') + geom_hline(yintercept=1, color="black", size=.1)
grid.newpage()
grid.draw(rbind(ggplotGrob(pten_p_pos), ggplotGrob(pten_dssp_schematic1), size = "last"))
```
```{r}
#matching snake at the end!
pten_g_pos <- ggplot(data=subset(pten1_proc_wt, end=="G"), aes(x=position, y=score, colour=start))+ geom_point(data=subset(pten1_proc_wt, end=="G" & start %in% aas2), size=.6) + scale_x_continuous(breaks=seq(0, 403, 20), minor_breaks = seq(0, 403, 5), limits = c(0, 403), expand = c(0,0)) + scale_y_continuous(expand = c(0,0), limits = c(-0.25, 1.5)) + scale_color_manual(values=twenty_color1) +ylab("Abundance score")+labs(colour="Reference amino acid")+xlab(NULL)+ggtitle("Abundance scores of Glycine variants") + theme(axis.text.x = element_blank(), legend.position='top') + geom_hline(yintercept=1, color="black", size=.1)
grid.newpage()
grid.draw(rbind(ggplotGrob(pten_g_pos), ggplotGrob(pten_dssp_schematic1), size = "last"))

pten_h_pos <- ggplot(data=subset(pten1_proc_wt, end=="H"), aes(x=position, y=score, colour=start))+ geom_point(data=subset(pten1_proc_wt, end=="H" & start %in% aas2), size=.6) + scale_x_continuous(breaks=seq(0, 403, 20), minor_breaks = seq(0, 403, 5), limits = c(0, 403), expand = c(0,0)) + scale_y_continuous(expand = c(0,0), limits = c(-0.25, 1.5)) + scale_color_manual(values=twenty_color1) +ylab("Abundance score")+labs(colour="Reference amino acid")+xlab(NULL)+ggtitle("Abundance scores of Histidine variants") + theme(axis.text.x = element_blank(), legend.position='top') + geom_hline(yintercept=1, color="black", size=.1)
grid.newpage()
grid.draw(rbind(ggplotGrob(pten_h_pos), ggplotGrob(pten_dssp_schematic1), size = "last"))

#2nd from last grey region
pten_w_pos <- ggplot(data=subset(pten1_proc_wt, end=="W"), aes(x=position, y=score, colour=start))+ geom_point(data=subset(pten1_proc_wt, end=="W" & start %in% aas2), size=.6) + scale_x_continuous(breaks=seq(0, 403, 20), minor_breaks = seq(0, 403, 5), limits = c(0, 403), expand = c(0,0)) + scale_y_continuous(expand = c(0,0), limits = c(-0.25, 1.5)) + scale_color_manual(values=twenty_color1) +ylab("Abundance score")+labs(colour="Reference amino acid")+xlab(NULL)+ggtitle("Abundance scores of Typtophan variants") + theme(axis.text.x = element_blank(), legend.position='top') + geom_hline(yintercept=1, color="black", size=.1)
grid.newpage()
grid.draw(rbind(ggplotGrob(pten_w_pos), ggplotGrob(pten_dssp_schematic1), size = "last"))
```
```{r}

```


